diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml index 3cc43fe7538a0..0652de980e169 100644 --- a/.github/workflows/bot.yml +++ b/.github/workflows/bot.yml @@ -31,5 +31,5 @@ jobs: - name: Build Project env: SCALA_PROFILE: ${{ matrix.scala }} - SPAKR_PROFILE: ${{ matrix.spark }} - run: mvn install -P "$SCALA_PROFILE,$SPAKR_PROFILE" -DskipTests=true -Dmaven.javadoc.skip=true -B -V + SPARK_PROFILE: ${{ matrix.spark }} + run: mvn install -P "$SCALA_PROFILE,$SPARK_PROFILE" -DskipTests=true -Dmaven.javadoc.skip=true -B -V diff --git a/dependencies/hudi-flink-bundle_2.11.txt b/dependencies/hudi-flink-bundle_2.11.txt new file mode 100644 index 0000000000000..7ece1e855e939 --- /dev/null +++ b/dependencies/hudi-flink-bundle_2.11.txt @@ -0,0 +1,294 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +akka-actor_2.11/com.typesafe.akka/2.5.21//akka-actor_2.11-2.5.21.jar +akka-protobuf_2.11/com.typesafe.akka/2.5.21//akka-protobuf_2.11-2.5.21.jar +akka-slf4j_2.11/com.typesafe.akka/2.5.21//akka-slf4j_2.11-2.5.21.jar +akka-stream_2.11/com.typesafe.akka/2.5.21//akka-stream_2.11-2.5.21.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +audience-annotations/org.apache.yetus/0.11.0//audience-annotations-0.11.0.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.10.0//avro-1.10.0.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.7.6//chill-java-0.7.6.jar +chill_2.11/com.twitter/0.7.6//chill_2.11-0.7.6.jar 
+commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.20//commons-compress-1.20.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.0.1//commons-httpclient-3.0.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.5//commons-math3-3.5.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +config/com.typesafe/1.3.3//config-1.3.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/5.0.1//datanucleus-api-jdo-5.0.1.jar +datanucleus-core/org.datanucleus/5.0.1//datanucleus-core-5.0.1.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/6.5.6//fastutil-6.5.6.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +flink-annotations/org.apache.flink/1.13.1//flink-annotations-1.13.1.jar +flink-avro/org.apache.flink/1.13.1//flink-avro-1.13.1.jar +flink-clients_2.11/org.apache.flink/1.13.1//flink-clients_2.11-1.13.1.jar +flink-connector-base/org.apache.flink/1.13.1//flink-connector-base-1.13.1.jar +flink-connector-kafka_2.11/org.apache.flink/1.13.1//flink-connector-kafka_2.11-1.13.1.jar +flink-core/org.apache.flink/1.13.1//flink-core-1.13.1.jar +flink-file-sink-common/org.apache.flink/1.13.1//flink-file-sink-common-1.13.1.jar +flink-hadoop-compatibility_2.11/org.apache.flink/1.13.1//flink-hadoop-compatibility_2.11-1.13.1.jar +flink-hadoop-fs/org.apache.flink/1.13.1//flink-hadoop-fs-1.13.1.jar +flink-java/org.apache.flink/1.13.1//flink-java-1.13.1.jar +flink-json/org.apache.flink/1.13.1//flink-json-1.13.1.jar +flink-metrics-core/org.apache.flink/1.13.1//flink-metrics-core-1.13.1.jar +flink-optimizer_2.11/org.apache.flink/1.13.1//flink-optimizer_2.11-1.13.1.jar +flink-parquet_2.11/org.apache.flink/1.13.1//flink-parquet_2.11-1.13.1.jar +flink-queryable-state-client-java/org.apache.flink/1.13.1//flink-queryable-state-client-java-1.13.1.jar +flink-runtime_2.11/org.apache.flink/1.13.1//flink-runtime_2.11-1.13.1.jar +flink-shaded-asm-7/org.apache.flink/7.1-13.0//flink-shaded-asm-7-7.1-13.0.jar 
+flink-shaded-guava/org.apache.flink/18.0-13.0//flink-shaded-guava-18.0-13.0.jar +flink-shaded-jackson/org.apache.flink/2.12.1-13.0//flink-shaded-jackson-2.12.1-13.0.jar +flink-shaded-netty/org.apache.flink/4.1.49.Final-13.0//flink-shaded-netty-4.1.49.Final-13.0.jar +flink-shaded-zookeeper-3/org.apache.flink/3.4.14-13.0//flink-shaded-zookeeper-3-3.4.14-13.0.jar +flink-streaming-java_2.11/org.apache.flink/1.13.1//flink-streaming-java_2.11-1.13.1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +force-shading/org.apache.flink/1.13.1//force-shading-1.13.1.jar +grizzled-slf4j_2.11/org.clapper/1.3.2//grizzled-slf4j_2.11-1.3.2.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar 
+hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.24.0-GA//javassist-3.24.0-GA.jar +javax.annotation-api/javax.annotation/1.3.2//javax.annotation-api-1.3.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar 
+jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/1.3.9//jsr305-1.3.9.jar +junit/junit/4.12//junit-4.12.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +kryo/com.esotericsoftware.kryo/2.24.0//kryo-2.24.0.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware.minlog/1.2//minlog-1.2.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +parquet-avro/org.apache.parquet/1.11.1//parquet-avro-1.11.1.jar +parquet-column/org.apache.parquet/1.11.1//parquet-column-1.11.1.jar +parquet-common/org.apache.parquet/1.11.1//parquet-common-1.11.1.jar +parquet-encoding/org.apache.parquet/1.11.1//parquet-encoding-1.11.1.jar +parquet-format-structures/org.apache.parquet/1.11.1//parquet-format-structures-1.11.1.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.11.1//parquet-hadoop-1.11.1.jar +parquet-jackson/org.apache.parquet/1.11.1//parquet-jackson-1.11.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +reactive-streams/org.reactivestreams/1.0.2//reactive-streams-1.0.2.jar 
+scala-java8-compat_2.11/org.scala-lang.modules/0.7.0//scala-java8-compat_2.11-0.7.0.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-parser-combinators_2.11/org.scala-lang.modules/1.1.1//scala-parser-combinators_2.11-1.1.1.jar +scopt_2.11/com.github.scopt/3.5.0//scopt_2.11-3.5.0.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.22//slf4j-api-1.7.22.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.7.1//snappy-java-1.1.7.1.jar +ssl-config-core_2.11/com.typesafe/0.3.7//ssl-config-core_2.11-0.3.7.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar diff --git a/dependencies/hudi-flink-bundle_2.12.txt b/dependencies/hudi-flink-bundle_2.12.txt new file mode 100644 index 0000000000000..d7566b5d709d8 --- /dev/null +++ b/dependencies/hudi-flink-bundle_2.12.txt @@ -0,0 +1,295 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +akka-actor_2.11/com.typesafe.akka/2.5.21//akka-actor_2.11-2.5.21.jar +akka-protobuf_2.11/com.typesafe.akka/2.5.21//akka-protobuf_2.11-2.5.21.jar +akka-slf4j_2.11/com.typesafe.akka/2.5.21//akka-slf4j_2.11-2.5.21.jar +akka-stream_2.11/com.typesafe.akka/2.5.21//akka-stream_2.11-2.5.21.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +audience-annotations/org.apache.yetus/0.11.0//audience-annotations-0.11.0.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.10.0//avro-1.10.0.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.7.6//chill-java-0.7.6.jar +chill_2.11/com.twitter/0.7.6//chill_2.11-0.7.6.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.20//commons-compress-1.20.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.0.1//commons-httpclient-3.0.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.5//commons-math3-3.5.jar 
+commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +config/com.typesafe/1.3.3//config-1.3.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/5.0.1//datanucleus-api-jdo-5.0.1.jar +datanucleus-core/org.datanucleus/5.0.1//datanucleus-core-5.0.1.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/6.5.6//fastutil-6.5.6.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +flink-annotations/org.apache.flink/1.13.1//flink-annotations-1.13.1.jar +flink-avro/org.apache.flink/1.13.1//flink-avro-1.13.1.jar +flink-clients_2.11/org.apache.flink/1.13.1//flink-clients_2.11-1.13.1.jar +flink-connector-base/org.apache.flink/1.13.1//flink-connector-base-1.13.1.jar +flink-connector-kafka_2.11/org.apache.flink/1.13.1//flink-connector-kafka_2.11-1.13.1.jar +flink-core/org.apache.flink/1.13.1//flink-core-1.13.1.jar +flink-file-sink-common/org.apache.flink/1.13.1//flink-file-sink-common-1.13.1.jar +flink-hadoop-compatibility_2.11/org.apache.flink/1.13.1//flink-hadoop-compatibility_2.11-1.13.1.jar +flink-hadoop-compatibility_2.12/org.apache.flink/1.13.1//flink-hadoop-compatibility_2.12-1.13.1.jar +flink-hadoop-fs/org.apache.flink/1.13.1//flink-hadoop-fs-1.13.1.jar +flink-java/org.apache.flink/1.13.1//flink-java-1.13.1.jar +flink-json/org.apache.flink/1.13.1//flink-json-1.13.1.jar +flink-metrics-core/org.apache.flink/1.13.1//flink-metrics-core-1.13.1.jar +flink-optimizer_2.11/org.apache.flink/1.13.1//flink-optimizer_2.11-1.13.1.jar +flink-parquet_2.12/org.apache.flink/1.13.1//flink-parquet_2.12-1.13.1.jar +flink-queryable-state-client-java/org.apache.flink/1.13.1//flink-queryable-state-client-java-1.13.1.jar +flink-runtime_2.11/org.apache.flink/1.13.1//flink-runtime_2.11-1.13.1.jar +flink-shaded-asm-7/org.apache.flink/7.1-13.0//flink-shaded-asm-7-7.1-13.0.jar +flink-shaded-guava/org.apache.flink/18.0-13.0//flink-shaded-guava-18.0-13.0.jar +flink-shaded-jackson/org.apache.flink/2.12.1-13.0//flink-shaded-jackson-2.12.1-13.0.jar +flink-shaded-netty/org.apache.flink/4.1.49.Final-13.0//flink-shaded-netty-4.1.49.Final-13.0.jar +flink-shaded-zookeeper-3/org.apache.flink/3.4.14-13.0//flink-shaded-zookeeper-3-3.4.14-13.0.jar +flink-streaming-java_2.11/org.apache.flink/1.13.1//flink-streaming-java_2.11-1.13.1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +force-shading/org.apache.flink/1.13.1//force-shading-1.13.1.jar +grizzled-slf4j_2.11/org.clapper/1.3.2//grizzled-slf4j_2.11-1.3.2.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar 
+hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar 
+jackson-annotations/com.fasterxml.jackson.core/2.10.0//jackson-annotations-2.10.0.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/com.fasterxml.jackson.core/2.10.0//jackson-databind-2.10.0.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.24.0-GA//javassist-3.24.0-GA.jar +javax.annotation-api/javax.annotation/1.3.2//javax.annotation-api-1.3.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/1.3.9//jsr305-1.3.9.jar +junit/junit/4.12//junit-4.12.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar 
+kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +kryo/com.esotericsoftware.kryo/2.24.0//kryo-2.24.0.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware.minlog/1.2//minlog-1.2.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +parquet-avro/org.apache.parquet/1.11.1//parquet-avro-1.11.1.jar +parquet-column/org.apache.parquet/1.11.1//parquet-column-1.11.1.jar +parquet-common/org.apache.parquet/1.11.1//parquet-common-1.11.1.jar +parquet-encoding/org.apache.parquet/1.11.1//parquet-encoding-1.11.1.jar +parquet-format-structures/org.apache.parquet/1.11.1//parquet-format-structures-1.11.1.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.11.1//parquet-hadoop-1.11.1.jar +parquet-jackson/org.apache.parquet/1.11.1//parquet-jackson-1.11.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +reactive-streams/org.reactivestreams/1.0.2//reactive-streams-1.0.2.jar +scala-java8-compat_2.11/org.scala-lang.modules/0.7.0//scala-java8-compat_2.11-0.7.0.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-parser-combinators_2.11/org.scala-lang.modules/1.1.1//scala-parser-combinators_2.11-1.1.1.jar +scopt_2.11/com.github.scopt/3.5.0//scopt_2.11-3.5.0.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.22//slf4j-api-1.7.22.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.7.1//snappy-java-1.1.7.1.jar +ssl-config-core_2.11/com.typesafe/0.3.7//ssl-config-core_2.11-0.3.7.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar 
+tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar diff --git a/dependencies/hudi-hadoop-mr-bundle.txt b/dependencies/hudi-hadoop-mr-bundle.txt new file mode 100644 index 0000000000000..bcc26595945a9 --- /dev/null +++ b/dependencies/hudi-hadoop-mr-bundle.txt @@ -0,0 +1,135 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.9//commons-codec-1.9.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.8.1//commons-compress-1.8.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar 
+hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javax.inject/javax.inject/1//javax.inject-1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +junit/junit/4.12//junit-4.12.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar 
+servlet-api/javax.servlet/2.5//servlet-api-2.5.jar +slf4j-api/org.slf4j/1.7.7//slf4j-api-1.7.7.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar diff --git a/dependencies/hudi-hive-sync-bundle.txt b/dependencies/hudi-hive-sync-bundle.txt new file mode 100644 index 0000000000000..3c3c23002f61b --- /dev/null +++ b/dependencies/hudi-hive-sync-bundle.txt @@ -0,0 +1,132 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.9//commons-codec-1.9.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.8.1//commons-compress-1.8.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar 
+fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javax.inject/javax.inject/1//javax.inject-1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar 
+jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +junit/junit/4.12//junit-4.12.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +slf4j-api/org.slf4j/1.7.7//slf4j-api-1.7.7.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar diff --git a/dependencies/hudi-integ-test-bundle.txt b/dependencies/hudi-integ-test-bundle.txt new file mode 100644 index 0000000000000..77eec2b44d3e7 --- /dev/null +++ b/dependencies/hudi-integ-test-bundle.txt @@ -0,0 +1,345 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +RoaringBitmap/org.roaringbitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +activation/javax.activation/1.1.1//activation-1.1.1.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +antlr4-runtime/org.antlr/4.7//antlr4-runtime-4.7.jar +aopalliance-repackaged/org.glassfish.hk2.external/2.4.0-b10//aopalliance-repackaged-2.4.0-b10.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +arrow-format/org.apache.arrow/0.10.0//arrow-format-0.10.0.jar +arrow-memory/org.apache.arrow/0.10.0//arrow-memory-0.10.0.jar +arrow-vector/org.apache.arrow/0.10.0//arrow-vector-0.10.0.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro-ipc/org.apache.avro/1.7.7//avro-ipc-1.7.7.jar +avro-ipc/org.apache.avro/1.7.7/tests/avro-ipc-1.7.7-tests.jar +avro-mapred/org.apache.avro/1.7.7//avro-mapred-1.7.7.jar +avro-mapred/org.apache.avro/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +aws-java-sdk-core/com.amazonaws/1.12.22//aws-java-sdk-core-1.12.22.jar +aws-java-sdk-sqs/com.amazonaws/1.12.22//aws-java-sdk-sqs-1.12.22.jar +bijection-avro_2.11/com.twitter/0.9.3//bijection-avro_2.11-0.9.3.jar +bijection-core_2.11/com.twitter/0.9.3//bijection-core_2.11-0.9.3.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.9.3//chill-java-0.9.3.jar +chill_2.11/com.twitter/0.9.3//chill_2.11-0.9.3.jar +common-config/io.confluent/5.3.4//common-config-5.3.4.jar +common-utils/io.confluent/5.3.4//common-utils-5.3.4.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.4.1//commons-compress-1.4.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/org.apache.commons/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar 
+commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.1.3//commons-logging-1.1.3.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.4//commons-pool-1.4.jar +compress-lzf/com.ning/1.0.3//compress-lzf-1.0.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +flatbuffers/com.vlkan/1.2.0-3f79e055//flatbuffers-1.2.0-3f79e055.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/11.0.2//guava-11.0.2.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3/tests/hadoop-common-2.7.3-tests.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3/tests/hadoop-hdfs-2.7.3-tests.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar 
+hbase-annotations/org.apache.hbase/1.1.1//hbase-annotations-1.1.1.jar +hbase-client/org.apache.hbase/1.1.1//hbase-client-1.1.1.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1/standalone/hive-jdbc-2.3.1-standalone.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +hk2-api/org.glassfish.hk2/2.4.0-b10//hk2-api-2.4.0-b10.jar +hk2-locator/org.glassfish.hk2/2.4.0-b10//hk2-locator-2.4.0-b10.jar +hk2-utils/org.glassfish.hk2/2.4.0-b10//hk2-utils-2.4.0-b10.jar +hppc/com.carrotsearch/0.7.2//hppc-0.7.2.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ion-java/software.amazon.ion/1.0.2//ion-java-1.0.2.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-dataformat-cbor/com.fasterxml.jackson.dataformat/2.12.3//jackson-dataformat-cbor-2.12.3.jar +jackson-dataformat-csv/com.fasterxml.jackson.dataformat/2.6.7//jackson-dataformat-csv-2.6.7.jar +jackson-dataformat-yaml/com.fasterxml.jackson.dataformat/2.7.4//jackson-dataformat-yaml-2.7.4.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar 
+jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.18.1-GA//javassist-3.18.1-GA.jar +javax.annotation-api/javax.annotation/1.2//javax.annotation-api-1.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.inject/org.glassfish.hk2.external/2.4.0-b10//javax.inject-2.4.0-b10.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javax.ws.rs-api/javax.ws.rs/2.0.1//javax.ws.rs-api-2.0.1.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcl-over-slf4j/org.slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-apache-connector/org.glassfish.jersey.connectors/2.17//jersey-apache-connector-2.17.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-client/org.glassfish.jersey.core/2.22.2//jersey-client-2.22.2.jar +jersey-common/org.glassfish.jersey.core/2.22.2//jersey-common-2.22.2.jar +jersey-container-servlet-core/org.glassfish.jersey.containers/2.17//jersey-container-servlet-core-2.17.jar +jersey-container-servlet/org.glassfish.jersey.containers/2.22.2//jersey-container-servlet-2.22.2.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guava/org.glassfish.jersey.bundles.repackaged/2.22.2//jersey-guava-2.22.2.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-media-jaxb/org.glassfish.jersey.media/2.17//jersey-media-jaxb-2.17.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jersey-server/org.glassfish.jersey.core/2.17//jersey-server-2.17.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +jmespath-java/com.amazonaws/1.12.22//jmespath-java-1.12.22.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +json4s-ast_2.11/org.json4s/3.5.3//json4s-ast_2.11-3.5.3.jar 
+json4s-core_2.11/org.json4s/3.5.3//json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11/org.json4s/3.5.3//json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11/org.json4s/3.5.3//json4s-scalap_2.11-3.5.3.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +jul-to-slf4j/org.slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +junit/junit/4.12//junit-4.12.jar +kafka-avro-serializer/io.confluent/5.3.4//kafka-avro-serializer-5.3.4.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kafka-schema-registry-client/io.confluent/5.3.4//kafka-schema-registry-client-5.3.4.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.0//lz4-java-1.4.0.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +mockito-all/org.mockito/1.10.19//mockito-all-1.10.19.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-mapreduce/org.apache.orc/1.5.5/nohive/orc-mapreduce-1.5.5-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/org.glassfish.hk2/1.0.1//osgi-resource-locator-1.0.1.jar +paranamer/com.thoughtworks.paranamer/2.8//paranamer-2.8.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +py4j/net.sf.py4j/0.10.7//py4j-0.10.7.jar +pyrolite/net.razorvine/4.13//pyrolite-4.13.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar 
+scala-library/org.scala-lang/2.11.8//scala-library-2.11.8.jar +scala-parser-combinators_2.11/org.scala-lang.modules/1.1.0//scala-parser-combinators_2.11-1.1.0.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +scala-xml_2.11/org.scala-lang.modules/1.0.6//scala-xml_2.11-1.0.6.jar +scalac-scoverage-runtime_2.11/org.scoverage/1.3.0//scalac-scoverage-runtime_2.11-1.3.0.jar +servlet-api/javax.servlet/2.5//servlet-api-2.5.jar +servlet-api/org.mortbay.jetty/2.5-20081211//servlet-api-2.5-20081211.jar +shims/org.roaringbitmap/0.7.45//shims-0.7.45.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.15//slf4j-api-1.7.15.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snakeyaml/org.yaml/1.15//snakeyaml-1.15.jar +snappy-java/org.xerial.snappy/1.1.7.3//snappy-java-1.1.7.3.jar +spark-catalyst_2.11/org.apache.spark/2.4.4//spark-catalyst_2.11-2.4.4.jar +spark-core_2.11/org.apache.spark/2.4.4//spark-core_2.11-2.4.4.jar +spark-kvstore_2.11/org.apache.spark/2.4.4//spark-kvstore_2.11-2.4.4.jar +spark-launcher_2.11/org.apache.spark/2.4.4//spark-launcher_2.11-2.4.4.jar +spark-network-common_2.11/org.apache.spark/2.4.4//spark-network-common_2.11-2.4.4.jar +spark-network-shuffle_2.11/org.apache.spark/2.4.4//spark-network-shuffle_2.11-2.4.4.jar +spark-sketch_2.11/org.apache.spark/2.4.4//spark-sketch_2.11-2.4.4.jar +spark-sql_2.11/org.apache.spark/2.4.4//spark-sql_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4//spark-streaming-kafka-0-10_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4/tests/spark-streaming-kafka-0-10_2.11-2.4.4-tests.jar +spark-streaming_2.11/org.apache.spark/2.4.4//spark-streaming_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +spark-unsafe_2.11/org.apache.spark/2.4.4//spark-unsafe_2.11-2.4.4.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +stream/com.clearspring.analytics/2.7.0//stream-2.7.0.jar +stringtemplate/org.antlr/4.0.2//stringtemplate-4.0.2.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +univocity-parsers/com.univocity/2.7.3//univocity-parsers-2.7.3.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +validation-api/javax.validation/1.1.0.Final//validation-api-1.1.0.Final.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar 
+websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xbean-asm6-shaded/org.apache.xbean/4.8//xbean-asm6-shaded-4.8.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zkclient/com.101tec/0.10//zkclient-0.10.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar +zstd-jni/com.github.luben/1.3.2-2//zstd-jni-1.3.2-2.jar diff --git a/dependencies/hudi-kafka-connect-bundle.txt b/dependencies/hudi-kafka-connect-bundle.txt new file mode 100644 index 0000000000000..c46a2a10bc024 --- /dev/null +++ b/dependencies/hudi-kafka-connect-bundle.txt @@ -0,0 +1,273 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +RoaringBitmap/org.roaringbitmap/0.7.45//RoaringBitmap-0.7.45.jar +activation/javax.activation/1.1.1//activation-1.1.1.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +akka-actor_2.11/com.typesafe.akka/2.5.21//akka-actor_2.11-2.5.21.jar +akka-protobuf_2.11/com.typesafe.akka/2.5.21//akka-protobuf_2.11-2.5.21.jar +akka-slf4j_2.11/com.typesafe.akka/2.5.21//akka-slf4j_2.11-2.5.21.jar +akka-stream_2.11/com.typesafe.akka/2.5.21//akka-stream_2.11-2.5.21.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +antlr-runtime/org.antlr/3.3//antlr-runtime-3.3.jar +aopalliance-repackaged/org.glassfish.hk2.external/2.4.0-b34//aopalliance-repackaged-2.4.0-b34.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro-ipc/org.apache.avro/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/org.apache.avro/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +aws-java-sdk-core/com.amazonaws/1.12.22//aws-java-sdk-core-1.12.22.jar +aws-java-sdk-sqs/com.amazonaws/1.12.22//aws-java-sdk-sqs-1.12.22.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +chill-java/com.twitter/0.9.3//chill-java-0.9.3.jar +chill_2.11/com.twitter/0.9.3//chill_2.11-0.9.3.jar +common-config/io.confluent/5.3.4//common-config-5.3.4.jar +common-utils/io.confluent/5.3.4//common-utils-5.3.4.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.8.1//commons-compress-1.8.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/org.apache.commons/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.3.2//commons-lang3-3.3.2.jar +commons-logging/commons-logging/1.1.3//commons-logging-1.1.3.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +compress-lzf/com.ning/1.0.3//compress-lzf-1.0.3.jar +config/com.typesafe/1.3.3//config-1.3.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +flink-annotations/org.apache.flink/1.12.1//flink-annotations-1.12.1.jar 
+flink-clients_2.11/org.apache.flink/1.13.1//flink-clients_2.11-1.13.1.jar +flink-connector-base/org.apache.flink/1.13.1//flink-connector-base-1.13.1.jar +flink-connector-kafka_2.11/org.apache.flink/1.13.1//flink-connector-kafka_2.11-1.13.1.jar +flink-core/org.apache.flink/1.12.1//flink-core-1.12.1.jar +flink-file-sink-common/org.apache.flink/1.13.1//flink-file-sink-common-1.13.1.jar +flink-hadoop-compatibility_2.11/org.apache.flink/1.13.1//flink-hadoop-compatibility_2.11-1.13.1.jar +flink-hadoop-fs/org.apache.flink/1.13.1//flink-hadoop-fs-1.13.1.jar +flink-java/org.apache.flink/1.13.1//flink-java-1.13.1.jar +flink-metrics-core/org.apache.flink/1.12.1//flink-metrics-core-1.12.1.jar +flink-optimizer_2.11/org.apache.flink/1.13.1//flink-optimizer_2.11-1.13.1.jar +flink-queryable-state-client-java/org.apache.flink/1.13.1//flink-queryable-state-client-java-1.13.1.jar +flink-runtime_2.11/org.apache.flink/1.13.1//flink-runtime_2.11-1.13.1.jar +flink-shaded-asm-7/org.apache.flink/7.1-12.0//flink-shaded-asm-7-7.1-12.0.jar +flink-shaded-guava/org.apache.flink/18.0-12.0//flink-shaded-guava-18.0-12.0.jar +flink-shaded-jackson/org.apache.flink/2.12.1-13.0//flink-shaded-jackson-2.12.1-13.0.jar +flink-shaded-netty/org.apache.flink/4.1.49.Final-13.0//flink-shaded-netty-4.1.49.Final-13.0.jar +flink-shaded-zookeeper-3/org.apache.flink/3.4.14-13.0//flink-shaded-zookeeper-3-3.4.14-13.0.jar +flink-streaming-java_2.11/org.apache.flink/1.13.1//flink-streaming-java_2.11-1.13.1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +force-shading/org.apache.flink/1.12.1//force-shading-1.12.1.jar +grizzled-slf4j_2.11/org.clapper/1.3.2//grizzled-slf4j_2.11-1.3.2.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/11.0.2//guava-11.0.2.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar 
+hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +hk2-api/org.glassfish.hk2/2.4.0-b34//hk2-api-2.4.0-b34.jar +hk2-locator/org.glassfish.hk2/2.4.0-b34//hk2-locator-2.4.0-b34.jar +hk2-utils/org.glassfish.hk2/2.4.0-b34//hk2-utils-2.4.0-b34.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ion-java/software.amazon.ion/1.0.2//ion-java-1.0.2.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-dataformat-cbor/com.fasterxml.jackson.dataformat/2.12.3//jackson-dataformat-cbor-2.12.3.jar +jackson-dataformat-csv/com.fasterxml.jackson.dataformat/2.6.7//jackson-dataformat-csv-2.6.7.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.24.0-GA//javassist-3.24.0-GA.jar +javax.annotation-api/javax.annotation/1.2//javax.annotation-api-1.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.inject/org.glassfish.hk2.external/2.4.0-b34//javax.inject-2.4.0-b34.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.ws.rs-api/javax.ws.rs/2.0.1//javax.ws.rs-api-2.0.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcl-over-slf4j/org.slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-client/org.glassfish.jersey.core/2.22.2//jersey-client-2.22.2.jar +jersey-common/org.glassfish.jersey.core/2.22.2//jersey-common-2.22.2.jar +jersey-container-servlet-core/org.glassfish.jersey.containers/2.17//jersey-container-servlet-core-2.17.jar +jersey-container-servlet/org.glassfish.jersey.containers/2.22.2//jersey-container-servlet-2.22.2.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guava/org.glassfish.jersey.bundles.repackaged/2.22.2//jersey-guava-2.22.2.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-media-jaxb/org.glassfish.jersey.media/2.17//jersey-media-jaxb-2.17.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jersey-server/org.glassfish.jersey.core/2.17//jersey-server-2.17.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar 
+jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jmespath-java/com.amazonaws/1.12.22//jmespath-java-1.12.22.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json4s-ast_2.11/org.json4s/3.5.3//json4s-ast_2.11-3.5.3.jar +json4s-core_2.11/org.json4s/3.5.3//json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11/org.json4s/3.5.3//json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11/org.json4s/3.5.3//json4s-scalap_2.11-3.5.3.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +jul-to-slf4j/org.slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +junit/junit/4.12//junit-4.12.jar +kafka-avro-serializer/io.confluent/5.3.4//kafka-avro-serializer-5.3.4.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kafka-schema-registry-client/io.confluent/5.3.4//kafka-schema-registry-client-5.3.4.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.5//metrics-json-3.1.5.jar +metrics-jvm/io.dropwizard.metrics/3.1.5//metrics-jvm-3.1.5.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.7.0.Final//netty-3.7.0.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/org.glassfish.hk2/1.0.1//osgi-resource-locator-1.0.1.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar 
+protobuf-java/com.google.protobuf/3.17.3//protobuf-java-3.17.3.jar +py4j/net.sf.py4j/0.10.7//py4j-0.10.7.jar +pyrolite/net.razorvine/4.13//pyrolite-4.13.jar +reactive-streams/org.reactivestreams/1.0.2//reactive-streams-1.0.2.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-java8-compat_2.11/org.scala-lang.modules/0.7.0//scala-java8-compat_2.11-0.7.0.jar +scala-library/org.scala-lang/2.11.8//scala-library-2.11.8.jar +scala-parser-combinators_2.11/org.scala-lang.modules/1.1.1//scala-parser-combinators_2.11-1.1.1.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +scala-xml_2.11/org.scala-lang.modules/1.0.6//scala-xml_2.11-1.0.6.jar +scopt_2.11/com.github.scopt/3.5.0//scopt_2.11-3.5.0.jar +servlet-api/javax.servlet/2.5//servlet-api-2.5.jar +shims/org.roaringbitmap/0.7.45//shims-0.7.45.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.15//slf4j-api-1.7.15.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +spark-core_2.11/org.apache.spark/2.4.4//spark-core_2.11-2.4.4.jar +spark-kvstore_2.11/org.apache.spark/2.4.4//spark-kvstore_2.11-2.4.4.jar +spark-launcher_2.11/org.apache.spark/2.4.4//spark-launcher_2.11-2.4.4.jar +spark-network-common_2.11/org.apache.spark/2.4.4//spark-network-common_2.11-2.4.4.jar +spark-network-shuffle_2.11/org.apache.spark/2.4.4//spark-network-shuffle_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4//spark-streaming-kafka-0-10_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4/tests/spark-streaming-kafka-0-10_2.11-2.4.4-tests.jar +spark-streaming_2.11/org.apache.spark/2.4.4//spark-streaming_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +spark-unsafe_2.11/org.apache.spark/2.4.4//spark-unsafe_2.11-2.4.4.jar +ssl-config-core_2.11/com.typesafe/0.3.7//ssl-config-core_2.11-0.3.7.jar +stream/com.clearspring.analytics/2.7.0//stream-2.7.0.jar +stringtemplate/org.antlr/4.0.2//stringtemplate-4.0.2.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +validation-api/javax.validation/1.1.0.Final//validation-api-1.1.0.Final.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xbean-asm6-shaded/org.apache.xbean/4.8//xbean-asm6-shaded-4.8.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zkclient/com.101tec/0.10//zkclient-0.10.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zstd-jni/com.github.luben/1.3.2-2//zstd-jni-1.3.2-2.jar diff --git a/dependencies/hudi-presto-bundle.txt b/dependencies/hudi-presto-bundle.txt new file mode 100644 index 0000000000000..9cdc2329b4541 --- 
/dev/null +++ b/dependencies/hudi-presto-bundle.txt @@ -0,0 +1,132 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.9//commons-codec-1.9.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.8.1//commons-compress-1.8.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar 
+hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javax.inject/javax.inject/1//javax.inject-1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +jsr305/com.google.code.findbugs/1.3.9//jsr305-1.3.9.jar +junit/junit/4.12//junit-4.12.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar 
+orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +slf4j-api/org.slf4j/1.7.7//slf4j-api-1.7.7.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar diff --git a/dependencies/hudi-spark-bundle_2.11.txt b/dependencies/hudi-spark-bundle_2.11.txt new file mode 100644 index 0000000000000..886a1ea6beb4f --- /dev/null +++ b/dependencies/hudi-spark-bundle_2.11.txt @@ -0,0 +1,259 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar 
+groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar 
+hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar 
+json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +junit/junit/4.12//junit-4.12.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.10//slf4j-api-1.7.10.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar 
+snappy-java/org.xerial.snappy/1.1.2.6//snappy-java-1.1.2.6.jar
+spark-avro_2.11/org.apache.spark/2.4.4//spark-avro_2.11-2.4.4.jar
+spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar
+stax-api/stax/1.0.1//stax-api-1.0.1.jar
+tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar
+tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar
+tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar
+twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar
+twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar
+twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar
+twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar
+twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar
+twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar
+unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar
+velocity/org.apache.velocity/1.5//velocity-1.5.jar
+websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar
+websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar
+websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar
+websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar
+websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar
+xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar
+xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar
+xmlenc/xmlenc/0.52//xmlenc-0.52.jar
+xz/org.tukaani/1.5//xz-1.5.jar
+zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar
+zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar
diff --git a/dependencies/hudi-spark-bundle_2.12.txt b/dependencies/hudi-spark-bundle_2.12.txt
new file mode 100644
index 0000000000000..6b94353492d26
--- /dev/null
+++ b/dependencies/hudi-spark-bundle_2.12.txt
@@ -0,0 +1,259 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar 
+groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar 
+hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar 
+json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +junit/junit/4.12//junit-4.12.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.10//slf4j-api-1.7.10.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar 
+snappy-java/org.xerial.snappy/1.1.2.6//snappy-java-1.1.2.6.jar
+spark-avro_2.12/org.apache.spark/2.4.4//spark-avro_2.12-2.4.4.jar
+spark-tags_2.12/org.apache.spark/2.4.4//spark-tags_2.12-2.4.4.jar
+stax-api/stax/1.0.1//stax-api-1.0.1.jar
+tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar
+tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar
+tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar
+twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar
+twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar
+twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar
+twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar
+twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar
+twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar
+unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar
+velocity/org.apache.velocity/1.5//velocity-1.5.jar
+websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar
+websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar
+websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar
+websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar
+websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar
+xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar
+xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar
+xmlenc/xmlenc/0.52//xmlenc-0.52.jar
+xz/org.tukaani/1.5//xz-1.5.jar
+zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar
+zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar
diff --git a/dependencies/hudi-spark3-bundle_2.12.txt b/dependencies/hudi-spark3-bundle_2.12.txt
new file mode 100644
index 0000000000000..8193be44397f7
--- /dev/null
+++ b/dependencies/hudi-spark3-bundle_2.12.txt
@@ -0,0 +1,259 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar 
+groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar 
+hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.10.0//jackson-annotations-2.10.0.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/com.fasterxml.jackson.core/2.10.0//jackson-databind-2.10.0.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar 
+json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +junit/junit/4.12//junit-4.12.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.10//slf4j-api-1.7.10.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar 
+snappy-java/org.xerial.snappy/1.1.2.6//snappy-java-1.1.2.6.jar
+spark-avro_2.12/org.apache.spark/3.0.0//spark-avro_2.12-3.0.0.jar
+spark-tags_2.12/org.apache.spark/3.0.0//spark-tags_2.12-3.0.0.jar
+stax-api/stax/1.0.1//stax-api-1.0.1.jar
+tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar
+tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar
+tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar
+twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar
+twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar
+twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar
+twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar
+twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar
+twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar
+unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar
+velocity/org.apache.velocity/1.5//velocity-1.5.jar
+websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar
+websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar
+websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar
+websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar
+websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar
+xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar
+xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar
+xmlenc/xmlenc/0.52//xmlenc-0.52.jar
+xz/org.tukaani/1.5//xz-1.5.jar
+zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar
+zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar
diff --git a/dependencies/hudi-timeline-server-bundle.txt b/dependencies/hudi-timeline-server-bundle.txt
new file mode 100644
index 0000000000000..e60d4c215361f
--- /dev/null
+++ b/dependencies/hudi-timeline-server-bundle.txt
@@ -0,0 +1,143 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.4.1//commons-compress-1.4.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/11.0.2//guava-11.0.2.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar 
+hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +junit/junit/4.12//junit-4.12.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar 
+minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar
+netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar
+netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar
+objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar
+orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar
+orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar
+paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar
+protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar
+rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar
+slf4j-api/org.slf4j/1.7.26//slf4j-api-1.7.26.jar
+slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar
+snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar
+websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar
+websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar
+websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar
+websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar
+websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar
+xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar
+xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar
+xmlenc/xmlenc/0.52//xmlenc-0.52.jar
+xz/org.tukaani/1.5//xz-1.5.jar
+zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar
diff --git a/dependencies/hudi-utilities-bundle_2.11.txt b/dependencies/hudi-utilities-bundle_2.11.txt
new file mode 100644
index 0000000000000..9f15b2b58a01a
--- /dev/null
+++ b/dependencies/hudi-utilities-bundle_2.11.txt
@@ -0,0 +1,321 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +RoaringBitmap/org.roaringbitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +activation/javax.activation/1.1.1//activation-1.1.1.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance-repackaged/org.glassfish.hk2.external/2.4.0-b34//aopalliance-repackaged-2.4.0-b34.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro-ipc/org.apache.avro/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/org.apache.avro/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +aws-java-sdk-core/com.amazonaws/1.12.22//aws-java-sdk-core-1.12.22.jar +aws-java-sdk-sqs/com.amazonaws/1.12.22//aws-java-sdk-sqs-1.12.22.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.9.3//chill-java-0.9.3.jar +chill_2.11/com.twitter/0.9.3//chill_2.11-0.9.3.jar +common-config/io.confluent/5.3.4//common-config-5.3.4.jar +common-utils/io.confluent/5.3.4//common-utils-5.3.4.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/org.apache.commons/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar 
+compress-lzf/com.ning/1.0.3//compress-lzf-1.0.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar 
+hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +hk2-api/org.glassfish.hk2/2.4.0-b34//hk2-api-2.4.0-b34.jar +hk2-locator/org.glassfish.hk2/2.4.0-b34//hk2-locator-2.4.0-b34.jar +hk2-utils/org.glassfish.hk2/2.4.0-b34//hk2-utils-2.4.0-b34.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ion-java/software.amazon.ion/1.0.2//ion-java-1.0.2.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-dataformat-cbor/com.fasterxml.jackson.dataformat/2.12.3//jackson-dataformat-cbor-2.12.3.jar +jackson-dataformat-csv/com.fasterxml.jackson.dataformat/2.6.7//jackson-dataformat-csv-2.6.7.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.18.1-GA//javassist-3.18.1-GA.jar +javax.annotation-api/javax.annotation/1.2//javax.annotation-api-1.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.inject/org.glassfish.hk2.external/2.4.0-b34//javax.inject-2.4.0-b34.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javax.ws.rs-api/javax.ws.rs/2.0.1//javax.ws.rs-api-2.0.1.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar 
+jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcl-over-slf4j/org.slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-client/org.glassfish.jersey.core/2.22.2//jersey-client-2.22.2.jar +jersey-common/org.glassfish.jersey.core/2.22.2//jersey-common-2.22.2.jar +jersey-container-servlet-core/org.glassfish.jersey.containers/2.17//jersey-container-servlet-core-2.17.jar +jersey-container-servlet/org.glassfish.jersey.containers/2.22.2//jersey-container-servlet-2.22.2.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guava/org.glassfish.jersey.bundles.repackaged/2.22.2//jersey-guava-2.22.2.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-media-jaxb/org.glassfish.jersey.media/2.17//jersey-media-jaxb-2.17.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jersey-server/org.glassfish.jersey.core/2.17//jersey-server-2.17.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +jmespath-java/com.amazonaws/1.12.22//jmespath-java-1.12.22.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +json4s-ast_2.11/org.json4s/3.5.3//json4s-ast_2.11-3.5.3.jar +json4s-core_2.11/org.json4s/3.5.3//json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11/org.json4s/3.5.3//json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11/org.json4s/3.5.3//json4s-scalap_2.11-3.5.3.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +jul-to-slf4j/org.slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +junit/junit/4.12//junit-4.12.jar +kafka-avro-serializer/io.confluent/5.3.4//kafka-avro-serializer-5.3.4.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kafka-schema-registry-client/io.confluent/5.3.4//kafka-schema-registry-client-5.3.4.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar 
+leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/org.glassfish.hk2/1.0.1//osgi-resource-locator-1.0.1.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +py4j/net.sf.py4j/0.10.7//py4j-0.10.7.jar +pyrolite/net.razorvine/4.13//pyrolite-4.13.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +scala-xml_2.11/org.scala-lang.modules/1.0.6//scala-xml_2.11-1.0.6.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +shims/org.roaringbitmap/0.7.45//shims-0.7.45.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.15//slf4j-api-1.7.15.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.7.1//snappy-java-1.1.7.1.jar +spark-core_2.11/org.apache.spark/2.4.4//spark-core_2.11-2.4.4.jar +spark-kvstore_2.11/org.apache.spark/2.4.4//spark-kvstore_2.11-2.4.4.jar +spark-launcher_2.11/org.apache.spark/2.4.4//spark-launcher_2.11-2.4.4.jar 
+spark-network-common_2.11/org.apache.spark/2.4.4//spark-network-common_2.11-2.4.4.jar +spark-network-shuffle_2.11/org.apache.spark/2.4.4//spark-network-shuffle_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4//spark-streaming-kafka-0-10_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4/tests/spark-streaming-kafka-0-10_2.11-2.4.4-tests.jar +spark-streaming_2.11/org.apache.spark/2.4.4//spark-streaming_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +spark-unsafe_2.11/org.apache.spark/2.4.4//spark-unsafe_2.11-2.4.4.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +stream/com.clearspring.analytics/2.7.0//stream-2.7.0.jar +stringtemplate/org.antlr/4.0.2//stringtemplate-4.0.2.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +validation-api/javax.validation/1.1.0.Final//validation-api-1.1.0.Final.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xbean-asm6-shaded/org.apache.xbean/4.8//xbean-asm6-shaded-4.8.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zkclient/com.101tec/0.10//zkclient-0.10.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar +zstd-jni/com.github.luben/1.3.2-2//zstd-jni-1.3.2-2.jar diff --git a/dependencies/hudi-utilities-bundle_2.12.txt b/dependencies/hudi-utilities-bundle_2.12.txt new file mode 100644 index 0000000000000..802f1af22bebd --- /dev/null +++ b/dependencies/hudi-utilities-bundle_2.12.txt @@ -0,0 +1,321 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +RoaringBitmap/org.roaringbitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +activation/javax.activation/1.1.1//activation-1.1.1.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance-repackaged/org.glassfish.hk2.external/2.4.0-b34//aopalliance-repackaged-2.4.0-b34.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro-ipc/org.apache.avro/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/org.apache.avro/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +aws-java-sdk-core/com.amazonaws/1.12.22//aws-java-sdk-core-1.12.22.jar +aws-java-sdk-sqs/com.amazonaws/1.12.22//aws-java-sdk-sqs-1.12.22.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.9.3//chill-java-0.9.3.jar +chill_2.11/com.twitter/0.9.3//chill_2.11-0.9.3.jar +common-config/io.confluent/5.3.4//common-config-5.3.4.jar +common-utils/io.confluent/5.3.4//common-utils-5.3.4.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/org.apache.commons/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar 
+commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar +compress-lzf/com.ning/1.0.3//compress-lzf-1.0.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar 
+hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +hk2-api/org.glassfish.hk2/2.4.0-b34//hk2-api-2.4.0-b34.jar +hk2-locator/org.glassfish.hk2/2.4.0-b34//hk2-locator-2.4.0-b34.jar +hk2-utils/org.glassfish.hk2/2.4.0-b34//hk2-utils-2.4.0-b34.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ion-java/software.amazon.ion/1.0.2//ion-java-1.0.2.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.10.0//jackson-annotations-2.10.0.jar +jackson-core-asl/org.codehaus.jackson/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/com.fasterxml.jackson.core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/com.fasterxml.jackson.core/2.10.0//jackson-databind-2.10.0.jar +jackson-dataformat-cbor/com.fasterxml.jackson.dataformat/2.12.3//jackson-dataformat-cbor-2.12.3.jar +jackson-dataformat-csv/com.fasterxml.jackson.dataformat/2.6.7//jackson-dataformat-csv-2.6.7.jar +jackson-jaxrs/org.codehaus.jackson/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/org.codehaus.jackson/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jackson-xc/org.codehaus.jackson/1.9.13//jackson-xc-1.9.13.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.18.1-GA//javassist-3.18.1-GA.jar +javax.annotation-api/javax.annotation/1.2//javax.annotation-api-1.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.inject/org.glassfish.hk2.external/2.4.0-b34//javax.inject-2.4.0-b34.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar 
+javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javax.ws.rs-api/javax.ws.rs/2.0.1//javax.ws.rs-api-2.0.1.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcl-over-slf4j/org.slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-client/org.glassfish.jersey.core/2.22.2//jersey-client-2.22.2.jar +jersey-common/org.glassfish.jersey.core/2.22.2//jersey-common-2.22.2.jar +jersey-container-servlet-core/org.glassfish.jersey.containers/2.17//jersey-container-servlet-core-2.17.jar +jersey-container-servlet/org.glassfish.jersey.containers/2.22.2//jersey-container-servlet-2.22.2.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guava/org.glassfish.jersey.bundles.repackaged/2.22.2//jersey-guava-2.22.2.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-media-jaxb/org.glassfish.jersey.media/2.17//jersey-media-jaxb-2.17.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jersey-server/org.glassfish.jersey.core/2.17//jersey-server-2.17.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +jmespath-java/com.amazonaws/1.12.22//jmespath-java-1.12.22.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +json4s-ast_2.11/org.json4s/3.5.3//json4s-ast_2.11-3.5.3.jar +json4s-core_2.11/org.json4s/3.5.3//json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11/org.json4s/3.5.3//json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11/org.json4s/3.5.3//json4s-scalap_2.11-3.5.3.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +jul-to-slf4j/org.slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +junit/junit/4.12//junit-4.12.jar +kafka-avro-serializer/io.confluent/5.3.4//kafka-avro-serializer-5.3.4.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kafka-schema-registry-client/io.confluent/5.3.4//kafka-schema-registry-client-5.3.4.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar 
+kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/org.glassfish.hk2/1.0.1//osgi-resource-locator-1.0.1.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +py4j/net.sf.py4j/0.10.7//py4j-0.10.7.jar +pyrolite/net.razorvine/4.13//pyrolite-4.13.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +scala-xml_2.11/org.scala-lang.modules/1.0.6//scala-xml_2.11-1.0.6.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +shims/org.roaringbitmap/0.7.45//shims-0.7.45.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.15//slf4j-api-1.7.15.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.7.1//snappy-java-1.1.7.1.jar 
+spark-core_2.11/org.apache.spark/2.4.4//spark-core_2.11-2.4.4.jar +spark-kvstore_2.11/org.apache.spark/2.4.4//spark-kvstore_2.11-2.4.4.jar +spark-launcher_2.11/org.apache.spark/2.4.4//spark-launcher_2.11-2.4.4.jar +spark-network-common_2.11/org.apache.spark/2.4.4//spark-network-common_2.11-2.4.4.jar +spark-network-shuffle_2.11/org.apache.spark/2.4.4//spark-network-shuffle_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4//spark-streaming-kafka-0-10_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4/tests/spark-streaming-kafka-0-10_2.11-2.4.4-tests.jar +spark-streaming_2.11/org.apache.spark/2.4.4//spark-streaming_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +spark-unsafe_2.11/org.apache.spark/2.4.4//spark-unsafe_2.11-2.4.4.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +stream/com.clearspring.analytics/2.7.0//stream-2.7.0.jar +stringtemplate/org.antlr/4.0.2//stringtemplate-4.0.2.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +validation-api/javax.validation/1.1.0.Final//validation-api-1.1.0.Final.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xbean-asm6-shaded/org.apache.xbean/4.8//xbean-asm6-shaded-4.8.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zkclient/com.101tec/0.10//zkclient-0.10.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar +zstd-jni/com.github.luben/1.3.2-2//zstd-jni-1.3.2-2.jar diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml index 3e42d532bd2c3..05790963b1884 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml @@ -33,7 +33,7 @@ services: interval: 30s timeout: 10s retries: 3 - + datanode1: image: apachehudi/hudi-hadoop_2.8.4-datanode:latest container_name: datanode1 @@ -84,7 +84,7 @@ services: - hive-metastore-postgresql:/var/lib/postgresql hostname: hive-metastore-postgresql container_name: hive-metastore-postgresql - + hivemetastore: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest hostname: 
hivemetastore @@ -221,6 +221,15 @@ services: - ${HUDI_WS}:/var/hoodie/ws command: worker + graphite: + container_name: graphite + hostname: graphite + image: graphiteapp/graphite-statsd + ports: + - 80:80 + - 2003-2004:2003-2004 + - 8126:8126 + adhoc-1: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest hostname: adhoc-1 diff --git a/docker/demo/config/log4j.properties b/docker/demo/config/log4j.properties index 1618bff9c2233..225e62e47fe75 100644 --- a/docker/demo/config/log4j.properties +++ b/docker/demo/config/log4j.properties @@ -21,12 +21,12 @@ log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.target=System.err log4j.appender.console.layout=org.apache.log4j.PatternLayout log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - # Set the default spark-shell log level to WARN. When running the spark-shell, the # log level for this class is used to overwrite the root logger's log level, so that # the user can have different defaults for the shell and regular Spark apps. log4j.logger.org.apache.spark.repl.Main=WARN - +# Set logging of integration testsuite to INFO level +log4j.logger.org.apache.hudi.integ.testsuite=INFO # Settings to quiet third party logs that are too verbose log4j.logger.org.spark_project.jetty=WARN log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR @@ -35,7 +35,6 @@ log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO log4j.logger.org.apache.parquet=ERROR log4j.logger.parquet=ERROR log4j.logger.org.apache.spark=WARN - # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR \ No newline at end of file diff --git a/docker/demo/config/test-suite/compact-test.properties b/docker/demo/config/test-suite/compact-test.properties new file mode 100644 index 0000000000000..2eca88de3a426 --- /dev/null +++ b/docker/demo/config/test-suite/compact-test.properties @@ -0,0 +1,50 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.compact.inline.max.delta.commits=2 + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.mode=jdbc +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/complex-dag-cow.yaml b/docker/demo/config/test-suite/complex-dag-cow.yaml index acbe287ac22be..3a84b0a0acecd 100644 --- a/docker/demo/config/test-suite/complex-dag-cow.yaml +++ b/docker/demo/config/test-suite/complex-dag-cow.yaml @@ -49,7 +49,7 @@ dag_content: deps: third_insert first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_upsert: @@ -76,7 +76,7 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: true + validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/demo/config/test-suite/cow-clustering-example.yaml b/docker/demo/config/test-suite/cow-clustering-example.yaml index 939e16f55a54a..95932317c04fd 100644 --- a/docker/demo/config/test-suite/cow-clustering-example.yaml +++ b/docker/demo/config/test-suite/cow-clustering-example.yaml @@ -55,7 +55,7 @@ dag_content: deps: first_delete first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_cluster: @@ -71,6 +71,6 @@ dag_content: deps: first_cluster second_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/demo/config/test-suite/cow-long-running-example.yaml b/docker/demo/config/test-suite/cow-long-running-example.yaml index 71a34f8139a72..29b6858bf0506 100644 --- a/docker/demo/config/test-suite/cow-long-running-example.yaml +++ b/docker/demo/config/test-suite/cow-long-running-example.yaml @@ -49,7 +49,7 @@ dag_content: deps: third_insert first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: 
first_hive_sync first_upsert: @@ -76,7 +76,7 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: true + validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml b/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml index b071c46675a39..0ce529805567b 100644 --- a/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml +++ b/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml @@ -49,7 +49,7 @@ dag_content: deps: third_insert first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_upsert: @@ -76,7 +76,7 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: true + validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/demo/config/test-suite/mor-async-compact.yaml b/docker/demo/config/test-suite/mor-async-compact.yaml new file mode 100644 index 0000000000000..4ee9c535ebce3 --- /dev/null +++ b/docker/demo/config/test-suite/mor-async-compact.yaml @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Use compact-test.properties for this yaml file. 
+dag_name: mor-async-compact.yaml +dag_rounds: 4 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + type: InsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_insert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_upsert + third_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: second_upsert + first_validate: + config: + delete_input_data: false + type: ValidateDatasetNode + deps: third_upsert + first_schedule_compact: + config: + type: ScheduleCompactNode + deps: first_validate + fourth_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_schedule_compact + fifth_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: fourth_upsert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + type: InsertNode + deps: fifth_upsert + sixth_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: second_insert + third_validate: + config: + delete_input_data: false + type: ValidateDatasetNode + deps: sixth_upsert + first_compact: + config: + type: CompactNode + deps: third_validate + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 500 + type: DeleteNode + deps: first_compact + fifth_validate: + config: + delete_input_data: false + type: ValidateDatasetNode + deps: first_delete \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-cow-ctas.yaml b/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-cow-ctas.yaml new file mode 100644 index 0000000000000..376d2a540b3f7 --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-cow-ctas.yaml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-sql-nonpartitioned-managed-cow-ctas.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + is_external: true + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + use_ctas: true + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-mor.yaml b/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-mor.yaml new file mode 100644 index 0000000000000..1899830c6fc16 --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-mor.yaml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-nonpartitioned-external-mor.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: mor + is_external: true + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow-ctas.yaml b/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow-ctas.yaml new file mode 100644 index 0000000000000..8659a90470188 --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow-ctas.yaml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-nonpartitioned-managed-cow-ctas.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + use_ctas: true + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow.yaml b/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow.yaml new file mode 100644 index 0000000000000..79ea448d0433a --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow.yaml @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-sql-nonpartitioned-managed-cow.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + #merge_records: + # config: + # merge_condition: target._row_key = source._row_key + # matched_action: update set * + # not_matched_action: insert * + # record_size: 1000 + # num_partitions_insert: 10 + # repeat_count: 1 + # num_records_upsert: 100 + # num_records_insert: 1000 + # type: spark.sql.SparkSqlMergeNode + # deps: insert_records + delete_records: + config: + condition_column: begin_lat + record_size: 1000 + repeat_count: 1 + ratio_records_change: 0.2 + type: spark.sql.SparkSqlDeleteNode + deps: insert_records + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: delete_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-partition-cow-updates.yaml b/docker/demo/config/test-suite/spark-sql-partition-cow-updates.yaml new file mode 100644 index 0000000000000..a4b52559a3375 --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-partition-cow-updates.yaml @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-sql-partitioned-managed-cow.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + partition_field: rider + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + first_validate: + config: + delete_input_data: false + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records + update_records: + config: + type: spark.sql.SparkSqlUpdateNode + deps: first_validate + delete_records: + config: + condition_column: begin_lat + record_size: 1000 + repeat_count: 1 + ratio_records_change: 0.2 + type: spark.sql.SparkSqlDeleteNode + deps: update_records + second_validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: delete_records diff --git a/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow-ctas.yaml b/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow-ctas.yaml new file mode 100644 index 0000000000000..da0f512315c3b --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow-ctas.yaml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-partitioned-managed-cow-ctas.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + partition_field: rider + use_ctas: true + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow.yaml b/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow.yaml new file mode 100644 index 0000000000000..cb75949552d6e --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow.yaml @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
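In the partitioned DAGs above, partition_field: rider corresponds to a PARTITIONED BY clause in the DDL, while SparkSqlUpdateNode and SparkSqlDeleteNode exercise Spark SQL UPDATE and DELETE against that table. The Java sketch below shows representative statements only: the column list is a cut-down, assumed version of the test-suite trip schema, the table name is invented, and the suite itself derives the begin_lat predicate so that roughly ratio_records_change of the records are affected.

import org.apache.spark.sql.SparkSession;

public class SparkSqlPartitionedCowSketch {
  static void run(SparkSession spark) {
    // Partitioned copy-on-write table (illustrative schema and name).
    spark.sql(
        "CREATE TABLE demo_cow_partitioned ("
            + "_row_key STRING, begin_lat DOUBLE, "
            + "test_suite_source_ordering_field BIGINT, rider STRING) "
            + "USING hudi PARTITIONED BY (rider) "
            + "TBLPROPERTIES (type = 'cow', primaryKey = '_row_key', "
            + "preCombineField = 'test_suite_source_ordering_field')");

    // SparkSqlUpdateNode: row-level update routed through Hudi's upsert path.
    spark.sql(
        "UPDATE demo_cow_partitioned SET begin_lat = begin_lat + 1.0 "
            + "WHERE rider = 'rider-001'");

    // SparkSqlDeleteNode: deletes keyed off the configured condition_column.
    spark.sql("DELETE FROM demo_cow_partitioned WHERE begin_lat < 0.2");
  }
}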
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-partitioned-managed-cow.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + partition_field: rider + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + #merge_records: + # config: + # merge_condition: target._row_key = source._row_key + # matched_action: update set * + # not_matched_action: insert * + # record_size: 1000 + # num_partitions_insert: 10 + # repeat_count: 1 + # num_records_upsert: 100 + # num_records_insert: 1000 + # type: spark.sql.SparkSqlMergeNode + # deps: insert_records + delete_records: + config: + condition_column: begin_lat + record_size: 1000 + repeat_count: 1 + ratio_records_change: 0.2 + type: spark.sql.SparkSqlDeleteNode + deps: insert_records + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: delete_records diff --git a/docker/demo/config/test-suite/test.properties b/docker/demo/config/test-suite/test.properties index b4f69d9cb4f8e..30cd1c1f02f09 100644 --- a/docker/demo/config/test-suite/test.properties +++ b/docker/demo/config/test-suite/test.properties @@ -25,10 +25,6 @@ hoodie.deltastreamer.source.test.max_unique_records=100000000 hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 - hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.datasource.hive_sync.skip_ro_suffix=true diff --git a/docker/generate_test_suite.sh b/docker/generate_test_suite.sh index d7c1405630f0a..48c876fa6184c 100755 --- a/docker/generate_test_suite.sh +++ b/docker/generate_test_suite.sh @@ -16,6 +16,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +usage=" +USAGE: +$(basename "$0") [--help] [--all boolen] -- Script to generate the test suite according to arguments provided and run these test suites. 
+ +where: + --help show this help text + --all include all test suites (medium, long and cluster) + --execute_test_suite flag to control whether the test suites are executed (DEFAULT- true) + --medium_num_iterations number of medium iterations (DEFAULT- 20) + --long_num_iterations number of long iterations (DEFAULT- 50) + --intermittent_delay_mins delay after every test run (DEFAULT- 1) + --table_type hoodie table type to test (DEFAULT COPY_ON_WRITE) + --include_long_test_suite_yaml include long infra test suite (DEFAULT false) + --include_medium_test_suite_yaml include medium infra test suite (DEFAULT false) + --cluster_num_itr number of cluster iterations (DEFAULT 30) + --include_cluster_yaml include cluster infra test suite (DEFAULT false) + --input_path input path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/input/) + --output_path output path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/output/) + +Example: +Note - Execute the command from within the docker folder + + 1. To generate and run all test suites + ./generate_test_suite.sh --all true + 2. To only generate test suites + ./generate_test_suite.sh --all true --execute_test_suite false + 3. To run only a specific test suite yaml + ./generate_test_suite.sh --execute_test_suite true --include_medium_test_suite_yaml true + " + + MEDIUM_NUM_ITR=20 LONG_NUM_ITR=50 DELAY_MINS=1 @@ -39,6 +70,17 @@ do key="$1" case $key in + --help) + echo "$usage" + exit + ;; + --all) + INCLUDE_LONG_TEST_SUITE="$2" + INCLUDE_MEDIUM_TEST_SUITE="$2" + INCLUDE_CLUSTER_YAML="$2" + shift # past argument + shift # past value + ;; --execute_test_suite) EXECUTE_TEST_SUITE="$2" shift # past argument @@ -115,12 +157,15 @@case $key in ;; *) # unknown option POSITIONAL+=("$1") # save it in an array for later + echo "Unknown argument provided - '$1'" + echo "$usage" + exit 0 shift # past argument ;; esac done set -- "${POSITIONAL[@]}" # restore positional parameters - +echo "$POSITIONAL" echo "Include Medium test suite $INCLUDE_MEDIUM_TEST_SUITE" if $INCLUDE_MEDIUM_TEST_SUITE ; then echo "Medium test suite iterations = ${MEDIUM_NUM_ITR}" @@ -232,7 +277,7 @@ fi if $EXECUTE_TEST_SUITE ; then - docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME adhoc-2:/opt/ + docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/"$JAR_NAME" adhoc-2:/opt/ docker exec -it adhoc-2 /bin/bash rm -rf /opt/staging* docker cp demo/config/test-suite/staging/ adhoc-2:/opt/ docker exec -it adhoc-2 /bin/bash echo "\n============================== Executing sanity test suite ============================== " diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java index 8bd842c825659..119ccb0dcf039 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java @@ -181,7 +181,7 @@ private int copyNonArchivedInstants(List instants, int limit, Str final HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); final HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); for (HoodieInstant instant : instants) { - String localPath = localFolder + File.separator + instant.getFileName(); + String localPath = localFolder + Path.SEPARATOR + instant.getFileName(); byte[] data = null; switch (instant.getAction()) { diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java
b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java index d0678fc8579d3..a506c8030a557 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java @@ -119,7 +119,7 @@ public String showAllFileSlices( @CliCommand(value = "show fsview latest", help = "Show latest file-system view") public String showLatestFileSlices( - @CliOption(key = {"partitionPath"}, help = "A valid paritition path", mandatory = true) String partition, + @CliOption(key = {"partitionPath"}, help = "A valid partition path", mandatory = true) String partition, @CliOption(key = {"baseFileOnly"}, help = "Only display base file view", unspecifiedDefaultValue = "false") boolean baseFileOnly, @CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed", diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java index 66c2eb02159e4..084d757f85250 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java @@ -89,7 +89,7 @@ public String validateSync( } private String getString(HoodieTableMetaClient target, HoodieTimeline targetTimeline, HoodieTableMetaClient source, long sourceCount, long targetCount, String sourceLatestCommit) - throws IOException { + throws IOException { List commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) .getInstants().collect(Collectors.toList()); if (commitsToCatchup.isEmpty()) { diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java index 433e9df4cad3e..127cb28ad0101 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java @@ -28,6 +28,7 @@ import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.CleanerUtils; import org.apache.hudi.exception.HoodieIOException; @@ -186,11 +187,11 @@ public void removeCorruptedPendingCleanAction() { CleanerUtils.getCleanerPlan(client, instant); } catch (AvroRuntimeException e) { LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant); - FSUtils.deleteInstantFile(client.getFs(), client.getMetaPath(), instant); + HoodieActiveTimeline.deleteInstantFile(client.getFs(), client.getMetaPath(), instant); } catch (IOException ioe) { if (ioe.getMessage().contains("Not an Avro data file")) { LOG.warn("Corruption found. 
Trying to remove corrupted clean instant file: " + instant); - FSUtils.deleteInstantFile(client.getFs(), client.getMetaPath(), instant); + HoodieActiveTimeline.deleteInstantFile(client.getFs(), client.getMetaPath(), instant); } else { throw new HoodieIOException(ioe.getMessage(), ioe); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java index f86937d234989..82688fecc3663 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java @@ -38,7 +38,8 @@ import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.constant.KeyGeneratorType; import org.apache.hudi.table.action.compact.strategy.UnBoundedCompactionStrategy; -import org.apache.hudi.table.upgrade.SparkUpgradeDowngrade; +import org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper; +import org.apache.hudi.table.upgrade.UpgradeDowngrade; import org.apache.hudi.utilities.HDFSParquetImporter; import org.apache.hudi.utilities.HDFSParquetImporter.Config; import org.apache.hudi.utilities.HoodieCleaner; @@ -453,7 +454,8 @@ protected static int upgradeOrDowngradeTable(JavaSparkContext jsc, String basePa .setLoadActiveTimelineOnLoad(false).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))).build(); try { - new SparkUpgradeDowngrade(metaClient, config, new HoodieSparkEngineContext(jsc)).run(metaClient, HoodieTableVersion.valueOf(toVersion), config, new HoodieSparkEngineContext(jsc), null); + new UpgradeDowngrade(metaClient, config, new HoodieSparkEngineContext(jsc), SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.valueOf(toVersion), null); LOG.info(String.format("Table at \"%s\" upgraded / downgraded to version \"%s\".", basePath, toVersion)); return 0; } catch (Exception e) { diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java index 5a1c457b10ef1..5f08f0097a451 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java @@ -51,7 +51,7 @@ public static long countNewRecords(HoodieTableMetaClient target, List co public static String getTimeDaysAgo(int numberOfDays) { Date date = Date.from(ZonedDateTime.now().minusDays(numberOfDays).toInstant()); - return HoodieActiveTimeline.COMMIT_FORMATTER.format(date); + return HoodieActiveTimeline.formatInstantTime(date); } /** @@ -61,8 +61,8 @@ public static String getTimeDaysAgo(int numberOfDays) { * b) hours: -1, returns 20200202010000 */ public static String addHours(String compactionCommitTime, int hours) throws ParseException { - Instant instant = HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).toInstant(); + Instant instant = HoodieActiveTimeline.parseInstantTime(compactionCommitTime).toInstant(); ZonedDateTime commitDateTime = ZonedDateTime.ofInstant(instant, ZoneId.systemDefault()); - return HoodieActiveTimeline.COMMIT_FORMATTER.format(Date.from(commitDateTime.plusHours(hours).toInstant())); + return HoodieActiveTimeline.formatInstantTime(Date.from(commitDateTime.plusHours(hours).toInstant())); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java 
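The CommitUtil hunk above replaces direct use of the shared HoodieActiveTimeline.COMMIT_FORMATTER constant with formatInstantTime/parseInstantTime calls; hiding the formatting behind static helpers avoids sharing a mutable java.text.SimpleDateFormat instance, which is not thread-safe. The sketch below only illustrates the shape of such helpers, assuming the historical yyyyMMddHHmmss instant-time pattern; it is not the actual Hudi implementation, which may differ in pattern, time zone handling and error handling.

import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Date;

public final class InstantTimeSketch {
  // DateTimeFormatter is immutable and thread-safe, unlike SimpleDateFormat.
  private static final DateTimeFormatter FORMATTER =
      DateTimeFormatter.ofPattern("yyyyMMddHHmmss");

  // Format a Date into an instant-time string, e.g. "20200202010000".
  public static String formatInstantTime(Date date) {
    return FORMATTER.format(date.toInstant().atZone(ZoneId.systemDefault()));
  }

  // Parse an instant-time string back into a Date, the inverse of the method above.
  public static Date parseInstantTime(String instantTime) {
    LocalDateTime parsed = LocalDateTime.parse(instantTime, FORMATTER);
    return Date.from(parsed.atZone(ZoneId.systemDefault()).toInstant());
  }
}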
index 791f4c21cf101..9732ce72b913d 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java @@ -24,6 +24,7 @@ import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; import org.apache.hudi.cli.testutils.HoodieTestCommitUtilities; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -71,6 +72,7 @@ public void init() throws Exception { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) .withSchema(HoodieTestCommitMetadataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .forTable("test-trip-table").build(); // Create six commits diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java index cdf642799438d..d71e7ec8d987d 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java @@ -25,6 +25,7 @@ import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; import org.apache.hudi.cli.testutils.HoodieTestReplaceCommitMetadataGenerator; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -208,6 +209,7 @@ public void testShowArchivedCommits() throws Exception { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath1) .withSchema(HoodieTestCommitMetadataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .forTable("test-trip-table").build(); // generate data and metadata diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java index e6d1dee89aa4b..de305f404455b 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java @@ -24,6 +24,7 @@ import org.apache.hudi.cli.TableHeader; import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -158,6 +159,7 @@ private void generateArchive() throws IOException { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) .withSchema(HoodieTestCommitMetadataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) 
.withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .forTable("test-trip-table").build(); // archive HoodieTableMetaClient metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java index 6b2bec4efcaab..f2571ce3598d6 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java @@ -53,7 +53,6 @@ import org.junit.jupiter.api.Test; import org.springframework.shell.core.CommandResult; -import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Files; @@ -174,7 +173,7 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc // write to path '2015/03/16'. Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); - partitionPath = tablePath + File.separator + HoodieTestCommitMetadataGenerator.DEFAULT_SECOND_PARTITION_PATH; + partitionPath = tablePath + Path.SEPARATOR + HoodieTestCommitMetadataGenerator.DEFAULT_SECOND_PARTITION_PATH; Files.createDirectories(Paths.get(partitionPath)); HoodieLogFormat.Writer writer = null; diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java index 83deb34bf5a94..08cdb7dc47f09 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java @@ -142,7 +142,7 @@ public void testCreateWithSpecifiedValues() { assertTrue(cr.isSuccess()); assertEquals("Metadata for table " + tableName + " loaded", cr.getResult().toString()); HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); - assertEquals(metaPath + File.separator + "archive", client.getArchivePath()); + assertEquals(metaPath + Path.SEPARATOR + "archive", client.getArchivePath()); assertEquals(tablePath, client.getBasePath()); assertEquals(metaPath, client.getMetaPath()); assertEquals(HoodieTableType.MERGE_ON_READ, client.getTableType()); @@ -181,7 +181,7 @@ public void testRefresh() throws IOException { private void testRefreshCommand(String command) throws IOException { // clean table matedata FileSystem fs = FileSystem.get(hadoopConf()); - fs.delete(new Path(tablePath + File.separator + HoodieTableMetaClient.METAFOLDER_NAME), true); + fs.delete(new Path(tablePath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME), true); // Create table assertTrue(prepareTable()); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java index c9ddd741a9d5c..6b3e3045507f1 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java @@ -18,6 +18,7 @@ package org.apache.hudi.cli.integ; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.commands.TableCommand; @@ -32,7 +33,6 @@ import org.junit.jupiter.api.Test; import org.springframework.shell.core.CommandResult; -import 
java.io.File; import java.io.IOException; import java.time.Instant; import java.util.Arrays; @@ -59,8 +59,8 @@ public class ITTestBootstrapCommand extends AbstractShellIntegrationTest { public void init() { String srcName = "source"; tableName = "test-table"; - sourcePath = basePath + File.separator + srcName; - tablePath = basePath + File.separator + tableName; + sourcePath = basePath + Path.SEPARATOR + srcName; + tablePath = basePath + Path.SEPARATOR + tableName; // generate test data partitions = Arrays.asList("2018", "2019", "2020"); @@ -68,7 +68,7 @@ public void init() { for (int i = 0; i < partitions.size(); i++) { Dataset df = TestBootstrap.generateTestRawTripDataset(timestamp, i * NUM_OF_RECORDS, i * NUM_OF_RECORDS + NUM_OF_RECORDS, null, jsc, sqlContext); - df.write().parquet(sourcePath + File.separator + PARTITION_FIELD + "=" + partitions.get(i)); + df.write().parquet(sourcePath + Path.SEPARATOR + PARTITION_FIELD + "=" + partitions.get(i)); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java index 53e543a5d0568..8cdc4c891084d 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java @@ -40,7 +40,6 @@ import org.junit.jupiter.api.Test; import org.springframework.shell.core.CommandResult; -import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; @@ -70,7 +69,7 @@ public class ITTestHDFSParquetImportCommand extends AbstractShellIntegrationTest @BeforeEach public void init() throws IOException, ParseException { tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + tablePath = basePath + Path.SEPARATOR + tableName; sourcePath = new Path(basePath, "source"); targetPath = new Path(tablePath); schemaFile = new Path(basePath, "file.schema").toString(); @@ -101,7 +100,7 @@ public void testConvertWithInsert() throws IOException { () -> assertEquals("Table imported to hoodie format", cr.getResult().toString())); // Check hudi table exist - String metaPath = targetPath + File.separator + HoodieTableMetaClient.METAFOLDER_NAME; + String metaPath = targetPath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME; assertTrue(Files.exists(Paths.get(metaPath)), "Hoodie table not exist."); // Load meta data diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java index e93323942b0d8..5f8021ab5d7d2 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java @@ -18,6 +18,7 @@ package org.apache.hudi.cli.integ; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.commands.TableCommand; import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; @@ -32,7 +33,6 @@ import org.junit.jupiter.api.Test; import org.springframework.shell.core.CommandResult; -import java.io.File; import java.io.IOException; import static org.junit.jupiter.api.Assertions.assertAll; @@ -53,7 +53,7 @@ public class ITTestSavepointsCommand extends AbstractShellIntegrationTest { @BeforeEach public void init() throws IOException { String tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + 
tablePath = basePath + Path.SEPARATOR + tableName; // Create table and connect new TableCommand().createTable( diff --git a/hudi-client/hudi-client-common/pom.xml b/hudi-client/hudi-client-common/pom.xml index e2b0eb8b25b5b..c67621fbb9a35 100644 --- a/hudi-client/hudi-client-common/pom.xml +++ b/hudi-client/hudi-client-common/pom.xml @@ -110,6 +110,7 @@ org.apache.hadoop hadoop-hdfs tests + test @@ -130,6 +131,7 @@ org.apache.hadoop hadoop-common tests + test org.mortbay.jetty diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java index dfb2fc8fc1c22..444eae62b2ec4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java @@ -24,6 +24,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.callback.HoodieWriteCommitCallback; import org.apache.hudi.callback.common.HoodieWriteCommitCallbackMessage; import org.apache.hudi.callback.util.HoodieCommitCallbackFactory; @@ -60,6 +61,7 @@ import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hudi.exception.HoodieSavepointException; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metrics.HoodieMetrics; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieTable; @@ -99,7 +101,7 @@ public abstract class AbstractHoodieWriteClient index; + private final transient HoodieIndex index; protected transient Timer.Context writeTimer = null; protected transient Timer.Context compactionTimer; @@ -136,7 +138,7 @@ public AbstractHoodieWriteClient(HoodieEngineContext context, HoodieWriteConfig this.txnManager = new TransactionManager(config, fs); } - protected abstract HoodieIndex createIndex(HoodieWriteConfig writeConfig); + protected abstract HoodieIndex createIndex(HoodieWriteConfig writeConfig); public void setOperationType(WriteOperationType operationType) { this.operationType = operationType; @@ -182,11 +184,12 @@ public boolean commitStats(String instantTime, List stats, Opti HoodieTable table = createTable(config, hadoopConf); HoodieCommitMetadata metadata = CommitUtils.buildMetadata(stats, partitionToReplaceFileIds, extraMetadata, operationType, config.getWriteSchema(), commitActionType); + HoodieInstant inflightInstant = new HoodieInstant(State.INFLIGHT, table.getMetaClient().getCommitActionType(), instantTime); HeartbeatUtils.abortIfHeartbeatExpired(instantTime, table, heartbeatClient, config); - this.txnManager.beginTransaction(Option.of(new HoodieInstant(State.INFLIGHT, table.getMetaClient().getCommitActionType(), instantTime)), + this.txnManager.beginTransaction(Option.of(inflightInstant), lastCompletedTxnAndMetadata.isPresent() ? 
Option.of(lastCompletedTxnAndMetadata.get().getLeft()) : Option.empty()); try { - preCommit(instantTime, metadata); + preCommit(inflightInstant, metadata); commit(table, commitActionType, instantTime, metadata, stats); postCommit(table, metadata, instantTime, extraMetadata); LOG.info("Committed " + instantTime); @@ -219,14 +222,18 @@ protected void commit(HoodieTable table, String commitActionType, String instant Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); } - protected abstract HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf); + protected HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf) { + return createTable(config, hadoopConf, false); + } + + protected abstract HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf, boolean refreshTimeline); void emitCommitMetrics(String instantTime, HoodieCommitMetadata metadata, String actionType) { try { if (writeTimer != null) { long durationInMs = metrics.getDurationInMs(writeTimer.stop()); - metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(instantTime).getTime(), durationInMs, + metrics.updateCommitMetrics(HoodieActiveTimeline.parseInstantTime(instantTime).getTime(), durationInMs, metadata, actionType); writeTimer = null; } @@ -236,13 +243,17 @@ void emitCommitMetrics(String instantTime, HoodieCommitMetadata metadata, String } } - protected void preCommit(String instantTime, HoodieCommitMetadata metadata) { - // no-op - // TODO : Conflict resolution is not supported for Flink & Java engines - } - - protected void syncTableMetadata() { - // no-op + /** + * Any pre-commit actions like conflict resolution or updating metadata table goes here. + * @param inflightInstant instant of inflight operation. + * @param metadata commit metadata for which pre commit is being invoked. + */ + protected void preCommit(HoodieInstant inflightInstant, HoodieCommitMetadata metadata) { + // Create a Hoodie table after starting the transaction which encapsulated the commits and files visible. + // Important to create this after the lock to ensure latest commits show up in the timeline without need for reload + HoodieTable table = createTable(config, hadoopConf); + table.getMetadataWriter().ifPresent(w -> ((HoodieTableMetadataWriter)w).update(metadata, inflightInstant.getTimestamp(), + table.isTableServiceAction(inflightInstant.getAction()))); } /** @@ -271,7 +282,7 @@ public void bootstrap(Option> extraMetadata) { */ public void rollbackFailedBootstrap() { LOG.info("Rolling back pending bootstrap if present"); - HoodieTable table = createTable(config, hadoopConf); + HoodieTable table = createTable(config, hadoopConf, config.isMetadataTableEnabled()); HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); Option instant = Option.fromJavaOptional( inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp).findFirst()); @@ -399,16 +410,6 @@ protected void preWrite(String instantTime, WriteOperationType writeOperationTyp HoodieTableMetaClient metaClient) { setOperationType(writeOperationType); this.lastCompletedTxnAndMetadata = TransactionUtils.getLastCompletedTxnInstantAndMetadata(metaClient); - this.txnManager.beginTransaction(Option.of(new HoodieInstant(State.INFLIGHT, metaClient.getCommitActionType(), instantTime)), lastCompletedTxnAndMetadata - .isPresent() - ? 
Option.of(lastCompletedTxnAndMetadata.get().getLeft()) : Option.empty()); - try { - if (writeOperationType != WriteOperationType.CLUSTER && writeOperationType != WriteOperationType.COMPACT) { - syncTableMetadata(); - } - } finally { - this.txnManager.endTransaction(); - } this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this); } @@ -434,13 +435,10 @@ protected void postCommit(HoodieTable table, HoodieCommitMetadata me // Delete the marker directory for the instant. WriteMarkersFactory.get(config.getMarkersType(), table, instantTime) .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + autoCleanOnCommit(); // We cannot have unbounded commit files. Archive commits if we have to archive HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(config, table); archiveLog.archiveIfRequired(context); - autoCleanOnCommit(); - if (operationType != null && operationType != WriteOperationType.CLUSTER && operationType != WriteOperationType.COMPACT) { - syncTableMetadata(); - } } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } finally { @@ -450,6 +448,9 @@ protected void postCommit(HoodieTable table, HoodieCommitMetadata me protected void runTableServicesInline(HoodieTable table, HoodieCommitMetadata metadata, Option> extraMetadata) { if (config.inlineTableServices()) { + if (config.isMetadataTableEnabled()) { + table.getHoodieView().sync(); + } // Do an inline compaction if enabled if (config.inlineCompactionEnabled()) { runAnyPendingCompactions(table); @@ -502,7 +503,7 @@ protected void autoCleanOnCommit() { } else { // Do not reuse instantTime for clean as metadata table requires all changes to have unique instant timestamps. LOG.info("Auto cleaning is enabled. Running cleaner now"); - clean(); + clean(true); } } } @@ -514,7 +515,7 @@ protected void autoCleanOnCommit() { * @param comment - Comment for the savepoint */ public void savepoint(String user, String comment) { - HoodieTable table = createTable(config, hadoopConf); + HoodieTable table = createTable(config, hadoopConf, config.isMetadataTableEnabled()); if (table.getCompletedCommitsTimeline().empty()) { throw new HoodieSavepointException("Could not savepoint. 
Commit timeline is empty"); } @@ -538,7 +539,7 @@ public void savepoint(String user, String comment) { * @param comment - Comment for the savepoint */ public void savepoint(String instantTime, String user, String comment) { - HoodieTable table = createTable(config, hadoopConf); + HoodieTable table = createTable(config, hadoopConf, config.isMetadataTableEnabled()); table.savepoint(context, instantTime, user, comment); } @@ -550,7 +551,7 @@ public void savepoint(String instantTime, String user, String comment) { * @return true if the savepoint was deleted successfully */ public void deleteSavepoint(String savepointTime) { - HoodieTable table = createTable(config, hadoopConf); + HoodieTable table = createTable(config, hadoopConf, config.isMetadataTableEnabled()); SavepointHelpers.deleteSavepoint(table, savepointTime); } @@ -565,22 +566,28 @@ public void deleteSavepoint(String savepointTime) { * @return true if the savepoint was restored to successfully */ public void restoreToSavepoint(String savepointTime) { - HoodieTable table = createTable(config, hadoopConf); + HoodieTable table = createTable(config, hadoopConf, config.isMetadataTableEnabled()); SavepointHelpers.validateSavepointPresence(table, savepointTime); restoreToInstant(savepointTime); SavepointHelpers.validateSavepointRestore(table, savepointTime); } + @Deprecated + public boolean rollback(final String commitInstantTime) throws HoodieRollbackException { + return rollback(commitInstantTime, false); + } + /** * @Deprecated * Rollback the inflight record changes with the given commit time. This * will be removed in future in favor of {@link AbstractHoodieWriteClient#restoreToInstant(String)} * * @param commitInstantTime Instant time of the commit + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. 
* @throws HoodieRollbackException if rollback cannot be performed successfully */ @Deprecated - public boolean rollback(final String commitInstantTime) throws HoodieRollbackException { + public boolean rollback(final String commitInstantTime, boolean skipLocking) throws HoodieRollbackException { LOG.info("Begin rollback of instant " + commitInstantTime); final String rollbackInstantTime = HoodieActiveTimeline.createNewInstantTime(); final Timer.Context timerContext = this.metrics.getRollbackCtx(); @@ -590,12 +597,21 @@ public boolean rollback(final String commitInstantTime) throws HoodieRollbackExc .filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime)) .findFirst()); if (commitInstantOpt.isPresent()) { - HoodieRollbackMetadata rollbackMetadata = table.rollback(context, rollbackInstantTime, commitInstantOpt.get(), true); - if (timerContext != null) { - long durationInMs = metrics.getDurationInMs(timerContext.stop()); - metrics.updateRollbackMetrics(durationInMs, rollbackMetadata.getTotalFilesDeleted()); + LOG.info("Scheduling Rollback at instant time :" + rollbackInstantTime); + Option rollbackPlanOption = table.scheduleRollback(context, rollbackInstantTime, + commitInstantOpt.get(), false); + if (rollbackPlanOption.isPresent()) { + // execute rollback + HoodieRollbackMetadata rollbackMetadata = table.rollback(context, rollbackInstantTime, commitInstantOpt.get(), true, + skipLocking); + if (timerContext != null) { + long durationInMs = metrics.getDurationInMs(timerContext.stop()); + metrics.updateRollbackMetrics(durationInMs, rollbackMetadata.getTotalFilesDeleted()); + } + return true; + } else { + throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commitInstantTime); } - return true; } else { LOG.warn("Cannot find instant " + commitInstantTime + " in the timeline, for rollback"); return false; @@ -616,7 +632,7 @@ public HoodieRestoreMetadata restoreToInstant(final String instantTime) throws H final String restoreInstantTime = HoodieActiveTimeline.createNewInstantTime(); Timer.Context timerContext = metrics.getRollbackCtx(); try { - HoodieTable table = createTable(config, hadoopConf); + HoodieTable table = createTable(config, hadoopConf, config.isMetadataTableEnabled()); HoodieRestoreMetadata restoreMetadata = table.restore(context, restoreInstantTime, instantTime); if (timerContext != null) { final long durationInMs = metrics.getDurationInMs(timerContext.stop()); @@ -638,7 +654,19 @@ public HoodieRestoreMetadata restoreToInstant(final String instantTime) throws H * cleaned) */ public HoodieCleanMetadata clean(String cleanInstantTime) throws HoodieIOException { - return clean(cleanInstantTime, true); + return clean(cleanInstantTime, true, false); + } + + /** + * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the + * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be + * cleaned) + * @param cleanInstantTime instant time for clean. + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. + * @return instance of {@link HoodieCleanMetadata}. + */ + public HoodieCleanMetadata clean(String cleanInstantTime, boolean skipLocking) throws HoodieIOException { + return clean(cleanInstantTime, true, skipLocking); } /** @@ -647,8 +675,11 @@ public HoodieCleanMetadata clean(String cleanInstantTime) throws HoodieIOExcepti * cleaned). 
This API provides the flexibility to schedule clean instant asynchronously via * {@link AbstractHoodieWriteClient#scheduleTableService(String, Option, TableServiceType)} and disable inline scheduling * of clean. + * @param cleanInstantTime instant time for clean. + * @param scheduleInline true if needs to be scheduled inline. false otherwise. + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. */ - public HoodieCleanMetadata clean(String cleanInstantTime, boolean scheduleInline) throws HoodieIOException { + public HoodieCleanMetadata clean(String cleanInstantTime, boolean scheduleInline, boolean skipLocking) throws HoodieIOException { if (scheduleInline) { scheduleTableServiceInternal(cleanInstantTime, Option.empty(), TableServiceType.CLEAN); } @@ -656,8 +687,8 @@ public HoodieCleanMetadata clean(String cleanInstantTime, boolean scheduleInline final Timer.Context timerContext = metrics.getCleanCtx(); LOG.info("Cleaned failed attempts if any"); CleanerUtils.rollbackFailedWrites(config.getFailedWritesCleanPolicy(), - HoodieTimeline.CLEAN_ACTION, () -> rollbackFailedWrites()); - HoodieCleanMetadata metadata = createTable(config, hadoopConf).clean(context, cleanInstantTime); + HoodieTimeline.CLEAN_ACTION, () -> rollbackFailedWrites(skipLocking)); + HoodieCleanMetadata metadata = createTable(config, hadoopConf).clean(context, cleanInstantTime, skipLocking); if (timerContext != null && metadata != null) { long durationMs = metrics.getDurationInMs(timerContext.stop()); metrics.updateCleanMetrics(durationMs, metadata.getTotalFilesDeleted()); @@ -669,7 +700,17 @@ public HoodieCleanMetadata clean(String cleanInstantTime, boolean scheduleInline } public HoodieCleanMetadata clean() { - return clean(HoodieActiveTimeline.createNewInstantTime()); + return clean(false); + } + + /** + * Triggers clean for the table. This refers to Clean up any stale/old files/data lying around (either on file storage or index storage) based on the + * * configurations and CleaningPolicy used. + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. + * @return instance of {@link HoodieCleanMetadata}. + */ + public HoodieCleanMetadata clean(boolean skipLocking) { + return clean(HoodieActiveTimeline.createNewInstantTime(), skipLocking); } /** @@ -769,17 +810,6 @@ public abstract void commitCompaction(String compactionInstantTime, O writeStatu protected abstract void completeCompaction(HoodieCommitMetadata metadata, O writeStatuses, HoodieTable table, String compactionCommitTime); - /** - * Rollback failed compactions. Inflight rollbacks for compactions revert the .inflight file to the .requested file - * TODO : Deprecate this method and make it protected - * @param inflightInstant Inflight Compaction Instant - * @param table Hoodie Table - */ - public void rollbackInflightCompaction(HoodieInstant inflightInstant, HoodieTable table) { - table.rollback(context, HoodieActiveTimeline.createNewInstantTime(), inflightInstant, false); - table.getActiveTimeline().revertCompactionInflightToRequested(inflightInstant); - } - /** * Get inflight time line exclude compaction and clustering. * @param metaClient @@ -802,20 +832,29 @@ private HoodieTimeline getInflightTimelineExcludeCompactionAndClustering(HoodieT * Rollback all failed writes. */ public Boolean rollbackFailedWrites() { + return rollbackFailedWrites(false); + } + + /** + * Rollback all failed writes. 
+ * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. + */ + public Boolean rollbackFailedWrites(boolean skipLocking) { HoodieTable table = createTable(config, hadoopConf); - List instantsToRollback = getInstantsToRollback(table.getMetaClient(), config.getFailedWritesCleanPolicy()); - rollbackFailedWrites(instantsToRollback); + List instantsToRollback = getInstantsToRollback(table.getMetaClient(), config.getFailedWritesCleanPolicy(), + Option.empty()); + rollbackFailedWrites(instantsToRollback, skipLocking); return true; } - protected void rollbackFailedWrites(List instantsToRollback) { + protected void rollbackFailedWrites(List instantsToRollback, boolean skipLocking) { for (String instant : instantsToRollback) { if (HoodieTimeline.compareTimestamps(instant, HoodieTimeline.LESSER_THAN_OR_EQUALS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)) { rollbackFailedBootstrap(); break; } else { - rollback(instant); + rollback(instant, skipLocking); } } // Delete any heartbeat files for already rolled back commits @@ -827,11 +866,17 @@ protected void rollbackFailedWrites(List instantsToRollback) { } } - protected List getInstantsToRollback(HoodieTableMetaClient metaClient, HoodieFailedWritesCleaningPolicy cleaningPolicy) { + protected List getInstantsToRollback(HoodieTableMetaClient metaClient, HoodieFailedWritesCleaningPolicy cleaningPolicy, Option curInstantTime) { Stream inflightInstantsStream = getInflightTimelineExcludeCompactionAndClustering(metaClient) .getReverseOrderedInstants(); if (cleaningPolicy.isEager()) { - return inflightInstantsStream.map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + return inflightInstantsStream.map(HoodieInstant::getTimestamp).filter(entry -> { + if (curInstantTime.isPresent()) { + return !entry.equals(curInstantTime.get()); + } else { + return true; + } + }).collect(Collectors.toList()); } else if (cleaningPolicy.isLazy()) { return inflightInstantsStream.filter(instant -> { try { @@ -947,17 +992,17 @@ private Option scheduleTableServiceInternal(String instantTime, Option clusteringPlan = createTable(config, hadoopConf) + Option clusteringPlan = createTable(config, hadoopConf, config.isMetadataTableEnabled()) .scheduleClustering(context, instantTime, extraMetadata); return clusteringPlan.isPresent() ? Option.of(instantTime) : Option.empty(); case COMPACT: LOG.info("Scheduling compaction at instant time :" + instantTime); - Option compactionPlan = createTable(config, hadoopConf) + Option compactionPlan = createTable(config, hadoopConf, config.isMetadataTableEnabled()) .scheduleCompaction(context, instantTime, extraMetadata); return compactionPlan.isPresent() ? Option.of(instantTime) : Option.empty(); case CLEAN: LOG.info("Scheduling cleaning at instant time :" + instantTime); - Option cleanerPlan = createTable(config, hadoopConf) + Option cleanerPlan = createTable(config, hadoopConf, config.isMetadataTableEnabled()) .scheduleCleaning(context, instantTime, extraMetadata); return cleanerPlan.isPresent() ? 
Option.of(instantTime) : Option.empty(); default: @@ -978,7 +1023,9 @@ protected Option inlineCluster(Option> extraMetadata } protected void rollbackInflightClustering(HoodieInstant inflightInstant, HoodieTable table) { - table.rollback(context, HoodieActiveTimeline.createNewInstantTime(), inflightInstant, false); + String commitTime = HoodieActiveTimeline.createNewInstantTime(); + table.scheduleRollback(context, commitTime, inflightInstant, false); + table.rollback(context, commitTime, inflightInstant, false, false); table.getActiveTimeline().revertReplaceCommitInflightToRequested(inflightInstant); } @@ -1009,7 +1056,7 @@ public HoodieMetrics getMetrics() { return metrics; } - public HoodieIndex getIndex() { + public HoodieIndex getIndex() { return index; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/WriteStatus.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/WriteStatus.java index a73138440219a..8f74858669278 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/WriteStatus.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/WriteStatus.java @@ -201,6 +201,7 @@ public void setTotalErrorRecords(long totalErrorRecords) { public String toString() { final StringBuilder sb = new StringBuilder("WriteStatus {"); sb.append("fileId=").append(fileId); + sb.append(", writeStat=").append(stat); sb.append(", globalError='").append(globalError).append('\''); sb.append(", hasErrors='").append(hasErrors()).append('\''); sb.append(", errorCount='").append(totalErrorRecords).append('\''); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HeartbeatUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HeartbeatUtils.java index 2fe0fef8bccfb..80191d4c3cdc2 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HeartbeatUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HeartbeatUtils.java @@ -29,7 +29,6 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.File; import java.io.IOException; import java.util.List; import java.util.Set; @@ -53,7 +52,7 @@ public static boolean deleteHeartbeatFile(FileSystem fs, String basePath, String boolean deleted = false; try { String heartbeatFolderPath = HoodieTableMetaClient.getHeartbeatFolderPath(basePath); - deleted = fs.delete(new Path(heartbeatFolderPath + File.separator + instantTime), false); + deleted = fs.delete(new Path(heartbeatFolderPath + Path.SEPARATOR + instantTime), false); if (!deleted) { LOG.error("Failed to delete heartbeat for instant " + instantTime); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HoodieHeartbeatClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HoodieHeartbeatClient.java index cb855c25dc671..341d72c754a95 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HoodieHeartbeatClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HoodieHeartbeatClient.java @@ -29,7 +29,6 @@ import org.apache.log4j.Logger; import javax.annotation.concurrent.NotThreadSafe; -import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.Serializable; @@ -207,7 +206,7 @@ public void stop() throws HoodieException { } public static Long 
getLastHeartbeatTime(FileSystem fs, String basePath, String instantTime) throws IOException { - Path heartbeatFilePath = new Path(HoodieTableMetaClient.getHeartbeatFolderPath(basePath) + File.separator + instantTime); + Path heartbeatFilePath = new Path(HoodieTableMetaClient.getHeartbeatFolderPath(basePath) + Path.SEPARATOR + instantTime); if (fs.exists(heartbeatFilePath)) { return fs.getFileStatus(heartbeatFilePath).getModificationTime(); } else { @@ -217,7 +216,7 @@ public static Long getLastHeartbeatTime(FileSystem fs, String basePath, String i } public static Boolean heartbeatExists(FileSystem fs, String basePath, String instantTime) throws IOException { - Path heartbeatFilePath = new Path(HoodieTableMetaClient.getHeartbeatFolderPath(basePath) + File.separator + instantTime); + Path heartbeatFilePath = new Path(HoodieTableMetaClient.getHeartbeatFolderPath(basePath) + Path.SEPARATOR + instantTime); if (fs.exists(heartbeatFilePath)) { return true; } @@ -255,7 +254,7 @@ private void updateHeartbeat(String instantTime) throws HoodieHeartbeatException try { Long newHeartbeatTime = System.currentTimeMillis(); OutputStream outputStream = - this.fs.create(new Path(heartbeatFolderPath + File.separator + instantTime), true); + this.fs.create(new Path(heartbeatFolderPath + Path.SEPARATOR + instantTime), true); outputStream.close(); Heartbeat heartbeat = instantToHeartbeatMap.get(instantTime); if (heartbeat.getLastHeartbeatTime() != null && isHeartbeatExpired(instantTime)) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java index 80a412010203e..39f397ab170d6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java @@ -58,7 +58,7 @@ public static Option resolveWriteConflictIfAny(final Hoodi if (config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) { ConflictResolutionStrategy resolutionStrategy = config.getWriteConflictResolutionStrategy(); Stream instantStream = resolutionStrategy.getCandidateInstants(table.getActiveTimeline(), currentTxnOwnerInstant.get(), lastCompletedTxnOwnerInstant); - final ConcurrentOperation thisOperation = new ConcurrentOperation(currentTxnOwnerInstant.get(), thisCommitMetadata.get()); + final ConcurrentOperation thisOperation = new ConcurrentOperation(currentTxnOwnerInstant.get(), thisCommitMetadata.orElse(new HoodieCommitMetadata())); instantStream.forEach(instant -> { try { ConcurrentOperation otherOperation = new ConcurrentOperation(instant, table.getMetaClient()); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java index f1e930b126f41..5fcd9dfd60be4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java @@ -22,10 +22,12 @@ import org.apache.hudi.common.config.ConfigGroups; import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.exception.HoodieException; import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.util.Locale; import 
java.util.Properties; /** @@ -40,6 +42,9 @@ public class HoodieClusteringConfig extends HoodieConfig { // Any strategy specific params can be saved with this prefix public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy."; + // Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix + public static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize."; + public static final ConfigProperty DAYBASED_LOOKBACK_PARTITIONS = ConfigProperty .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.lookback.partitions") .defaultValue("2") @@ -133,10 +138,59 @@ public class HoodieClusteringConfig extends HoodieConfig { public static final ConfigProperty PRESERVE_COMMIT_METADATA = ConfigProperty .key("hoodie.clustering.preserve.commit.metadata") - .defaultValue(false) + .defaultValue(true) .sinceVersion("0.9.0") .withDocumentation("When rewriting data, preserves existing hoodie_commit_time"); + public static final ConfigProperty LAYOUT_OPTIMIZE_ENABLE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "enable") + .defaultValue(false) + .sinceVersion("0.10.0") + .withDocumentation("Enable use z-ordering/space-filling curves to optimize the layout of table to boost query performance. " + + "This parameter takes precedence over clustering strategy set using " + EXECUTION_STRATEGY_CLASS_NAME.key()); + + public static final ConfigProperty LAYOUT_OPTIMIZE_STRATEGY = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy") + .defaultValue("z-order") + .sinceVersion("0.10.0") + .withDocumentation("Type of layout optimization to be applied, current only supports `z-order` and `hilbert` curves."); + + /** + * There exists two method to build z-curve. + * one is directly mapping sort cols to z-value to build z-curve; + * we can find this method in Amazon DynamoDB https://aws.amazon.com/cn/blogs/database/tag/z-order/ + * the other one is Boundary-based Interleaved Index method which we proposed. simply call it sample method. + * Refer to rfc-28 for specific algorithm flow. + * Boundary-based Interleaved Index method has better generalization, but the build speed is slower than direct method. + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "curve.build.method") + .defaultValue("direct") + .sinceVersion("0.10.0") + .withDocumentation("Controls how data is sampled to build the space filling curves. two methods: `direct`,`sample`." + + "The direct method is faster than the sampling, however sample method would produce a better data layout."); + /** + * Doing sample for table data is the first step in Boundary-based Interleaved Index method. + * larger sample number means better optimize result, but more memory consumption + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.sample.size") + .defaultValue("200000") + .sinceVersion("0.10.0") + .withDocumentation("when setting" + LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD.key() + " to `sample`, the amount of sampling to be done." + + "Large sample size leads to better results, at the expense of more memory usage."); + + /** + * The best way to use Z-order/Space-filling curves is to cooperate with Data-Skipping + * with data-skipping query engine can greatly reduce the number of table files to be read. 
+ * Without data skipping, the query engine can only do row-group level skipping within files (parquet/orc). + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "data.skipping.enable") + .defaultValue(true) + .sinceVersion("0.10.0") + .withDocumentation("Enable data skipping by collecting statistics once layout optimization is complete."); + /** * @deprecated Use {@link #PLAN_STRATEGY_CLASS_NAME} and its methods instead */ @@ -350,9 +404,58 @@ public Builder withPreserveHoodieCommitMetadata(Boolean preserveHoodieCommitMeta return this; } + public Builder withSpaceFillingCurveDataOptimizeEnable(Boolean enable) { + clusteringConfig.setValue(LAYOUT_OPTIMIZE_ENABLE, String.valueOf(enable)); + return this; + } + + public Builder withDataOptimizeStrategy(String strategy) { + clusteringConfig.setValue(LAYOUT_OPTIMIZE_STRATEGY, strategy); + return this; + } + + public Builder withDataOptimizeBuildCurveStrategy(String method) { + clusteringConfig.setValue(LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD, method); + return this; + } + + public Builder withDataOptimizeBuildCurveSampleNumber(int sampleNumber) { + clusteringConfig.setValue(LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE, String.valueOf(sampleNumber)); + return this; + } + + public Builder withDataOptimizeDataSkippingEnable(boolean dataSkipping) { + clusteringConfig.setValue(LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE, String.valueOf(dataSkipping)); + return this; + } + public HoodieClusteringConfig build() { clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName()); return clusteringConfig; } } + + /** + * Strategy types for building z-ordering/space-filling curves. + */ + public enum BuildCurveStrategyType { + DIRECT("direct"), + SAMPLE("sample"); + private final String value; + + BuildCurveStrategyType(String value) { + this.value = value; + } + + public static BuildCurveStrategyType fromValue(String value) { + switch (value.toLowerCase(Locale.ROOT)) { + case "direct": + return DIRECT; + case "sample": + return SAMPLE; + default: + throw new HoodieException("Invalid value of Type."); + } + } + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java index 7d048d53bcd0a..561460777ff6b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java @@ -148,7 +148,7 @@ public class HoodieHBaseIndexConfig extends HoodieConfig { .defaultValue(false) .withDocumentation("Only applies if index type is HBASE.
" + "When an already existing record is upserted to a new partition compared to whats in storage, " - + "this config when set, will delete old record in old paritition " + + "this config when set, will delete old record in old partition " + "and will insert it as new record in new partition."); public static final ConfigProperty ROLLBACK_SYNC_ENABLE = ConfigProperty diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java index 0553c86fb3a3c..22118da471347 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java @@ -110,6 +110,23 @@ public class HoodieStorageConfig extends HoodieConfig { .defaultValue("gzip") .withDocumentation("Compression Codec for parquet files"); + public static final ConfigProperty PARQUET_DICTIONARY_ENABLED = ConfigProperty + .key("hoodie.parquet.dictionary.enabled") + .defaultValue(true) + .withDocumentation("Whether to use dictionary encoding"); + + public static final ConfigProperty PARQUET_WRITE_LEGACY_FORMAT_ENABLED = ConfigProperty + .key("hoodie.parquet.writeLegacyFormat.enabled") + .defaultValue("false") + .withDocumentation("Sets spark.sql.parquet.writeLegacyFormat. If true, data will be written in a way of Spark 1.4 and earlier. " + + "For example, decimal values will be written in Parquet's fixed-length byte array format which other systems such as Apache Hive and Apache Impala use. " + + "If false, the newer format in Parquet will be used. For example, decimals will be written in int-based format."); + + public static final ConfigProperty PARQUET_OUTPUT_TIMESTAMP_TYPE = ConfigProperty + .key("hoodie.parquet.outputTimestampType") + .defaultValue("TIMESTAMP_MILLIS") + .withDocumentation("Sets spark.sql.parquet.outputTimestampType. 
Parquet timestamp type to use when Spark writes data to Parquet files."); + public static final ConfigProperty HFILE_COMPRESSION_ALGORITHM_NAME = ConfigProperty .key("hoodie.hfile.compression.algorithm") .defaultValue("GZ") @@ -307,6 +324,16 @@ public Builder parquetCompressionCodec(String parquetCompressionCodec) { return this; } + public Builder parquetWriteLegacyFormat(String parquetWriteLegacyFormat) { + storageConfig.setValue(PARQUET_WRITE_LEGACY_FORMAT_ENABLED, parquetWriteLegacyFormat); + return this; + } + + public Builder parquetOutputTimestampType(String parquetOutputTimestampType) { + storageConfig.setValue(PARQUET_OUTPUT_TIMESTAMP_TYPE, parquetOutputTimestampType); + return this; + } + public Builder hfileCompressionAlgorithm(String hfileCompressionAlgorithm) { storageConfig.setValue(HFILE_COMPRESSION_ALGORITHM_NAME, hfileCompressionAlgorithm); return this; @@ -342,5 +369,4 @@ public HoodieStorageConfig build() { return storageConfig; } } - } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 4df7d0deb6db0..736fe3b471bcb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -40,6 +40,11 @@ import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.metrics.HoodieMetricsConfig; +import org.apache.hudi.config.metrics.HoodieMetricsDatadogConfig; +import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; +import org.apache.hudi.config.metrics.HoodieMetricsPrometheusConfig; import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.SimpleAvroKeyGenerator; @@ -1223,6 +1228,30 @@ public String getClusteringSortColumns() { return getString(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS); } + /** + * Data layout optimize properties. + */ + public boolean isLayoutOptimizationEnabled() { + return getBoolean(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE); + } + + public String getLayoutOptimizationStrategy() { + return getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY); + } + + public HoodieClusteringConfig.BuildCurveStrategyType getLayoutOptimizationCurveBuildMethod() { + return HoodieClusteringConfig.BuildCurveStrategyType.fromValue( + getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD)); + } + + public int getLayoutOptimizationSampleSize() { + return getInt(HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE); + } + + public boolean isDataSkippingEnabled() { + return getBoolean(HoodieClusteringConfig.LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE); + } + /** * index properties. 
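Taken together, the new layout-optimization knobs in HoodieClusteringConfig and the new parquet writer knobs in HoodieStorageConfig surface through the HoodieWriteConfig getters above. A hedged sketch of wiring them up follows; newBuilder(), withPath, withClusteringConfig and withStorageConfig are assumed from Hudi's existing config API and are not part of this diff, while the remaining builder calls are the ones added here:

```java
import org.apache.hudi.config.HoodieClusteringConfig;
import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig;

public class LayoutOptimizeConfigSketch {

  public static HoodieWriteConfig buildWriteConfig(String basePath) {
    // Layout optimization options added by this patch (hoodie.layout.optimize.*).
    HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder()
        .withSpaceFillingCurveDataOptimizeEnable(true)
        .withDataOptimizeStrategy("z-order")            // or "hilbert"
        .withDataOptimizeBuildCurveStrategy("sample")   // "direct" is faster, "sample" lays out better
        .withDataOptimizeBuildCurveSampleNumber(200000)
        .withDataOptimizeDataSkippingEnable(true)
        .build();

    // Parquet writer options added by this patch (hoodie.parquet.*).
    HoodieStorageConfig storageConfig = HoodieStorageConfig.newBuilder()
        .parquetWriteLegacyFormat("false")
        .parquetOutputTimestampType("TIMESTAMP_MICROS")
        .build();

    // newBuilder()/withPath/withClusteringConfig/withStorageConfig assumed from the existing API.
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withClusteringConfig(clusteringConfig)
        .withStorageConfig(storageConfig)
        .build();
  }
}
```

With such a config, the write client would see isLayoutOptimizationEnabled(), getLayoutOptimizationCurveBuildMethod(), getLayoutOptimizationSampleSize() and isDataSkippingEnabled() return the values set above.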
*/ @@ -1398,10 +1427,6 @@ public int getLogFileDataBlockMaxSize() { return getInt(HoodieStorageConfig.LOGFILE_DATA_BLOCK_MAX_SIZE); } - public long getLogFileMaxSize() { - return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE); - } - public double getParquetCompressionRatio() { return getDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION); } @@ -1410,6 +1435,22 @@ public CompressionCodecName getParquetCompressionCodec() { return CompressionCodecName.fromConf(getString(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME)); } + public boolean parquetDictionaryEnabled() { + return getBoolean(HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED); + } + + public String parquetWriteLegacyFormatEnabled() { + return getString(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED); + } + + public String parquetOutputTimestampType() { + return getString(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE); + } + + public long getLogFileMaxSize() { + return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE); + } + public double getLogFileToParquetCompressionRatio() { return getDouble(HoodieStorageConfig.LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION); } @@ -1459,23 +1500,27 @@ public MetricsReporterType getMetricsReporterType() { } public String getGraphiteServerHost() { - return getString(HoodieMetricsConfig.GRAPHITE_SERVER_HOST_NAME); + return getString(HoodieMetricsGraphiteConfig.GRAPHITE_SERVER_HOST_NAME); } public int getGraphiteServerPort() { - return getInt(HoodieMetricsConfig.GRAPHITE_SERVER_PORT_NUM); + return getInt(HoodieMetricsGraphiteConfig.GRAPHITE_SERVER_PORT_NUM); } public String getGraphiteMetricPrefix() { - return getString(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX_VALUE); + return getString(HoodieMetricsGraphiteConfig.GRAPHITE_METRIC_PREFIX_VALUE); + } + + public int getGraphiteReportPeriodSeconds() { + return getInt(HoodieMetricsGraphiteConfig.GRAPHITE_REPORT_PERIOD_IN_SECONDS); } public String getJmxHost() { - return getString(HoodieMetricsConfig.JMX_HOST_NAME); + return getString(HoodieMetricsJmxConfig.JMX_HOST_NAME); } public String getJmxPort() { - return getString(HoodieMetricsConfig.JMX_PORT_NUM); + return getString(HoodieMetricsJmxConfig.JMX_PORT_NUM); } public int getDatadogReportPeriodSeconds() { @@ -1666,10 +1711,6 @@ public boolean isMetadataTableEnabled() { return metadataConfig.enabled(); } - public boolean getFileListingMetadataVerify() { - return metadataConfig.validateFileListingMetadata(); - } - public int getMetadataInsertParallelism() { return getInt(HoodieMetadataConfig.INSERT_PARALLELISM_VALUE); } @@ -1767,6 +1808,7 @@ public static class Builder { private boolean isStorageConfigSet = false; private boolean isCompactionConfigSet = false; private boolean isClusteringConfigSet = false; + private boolean isOptimizeConfigSet = false; private boolean isMetricsConfigSet = false; private boolean isBootstrapConfigSet = false; private boolean isMemoryConfigSet = false; @@ -1777,6 +1819,8 @@ public static class Builder { private boolean isMetadataConfigSet = false; private boolean isLockConfigSet = false; private boolean isPreCommitValidationConfigSet = false; + private boolean isMetricsJmxConfigSet = false; + private boolean isMetricsGraphiteConfigSet = false; public Builder withEngineType(EngineType engineType) { this.engineType = engineType; @@ -1931,6 +1975,18 @@ public Builder withLockConfig(HoodieLockConfig lockConfig) { return this; } + public Builder withMetricsJmxConfig(HoodieMetricsJmxConfig metricsJmxConfig) { + 
writeConfig.getProps().putAll(metricsJmxConfig.getProps()); + isMetricsJmxConfigSet = true; + return this; + } + + public Builder withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig mericsGraphiteConfig) { + writeConfig.getProps().putAll(mericsGraphiteConfig.getProps()); + isMetricsGraphiteConfigSet = true; + return this; + } + public Builder withPreCommitValidatorConfig(HoodiePreCommitValidatorConfig validatorConfig) { writeConfig.getProps().putAll(validatorConfig.getProps()); isPreCommitValidationConfigSet = true; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java similarity index 62% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java index e1d0b63ef35c4..60369d72fb4fc 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.config; +package org.apache.hudi.config.metrics; import org.apache.hudi.common.config.ConfigClassProperty; import org.apache.hudi.common.config.ConfigGroups; @@ -55,42 +55,6 @@ public class HoodieMetricsConfig extends HoodieConfig { .sinceVersion("0.5.0") .withDocumentation("Type of metrics reporter."); - // Graphite - public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; - - public static final ConfigProperty GRAPHITE_SERVER_HOST_NAME = ConfigProperty - .key(GRAPHITE_PREFIX + ".host") - .defaultValue("localhost") - .sinceVersion("0.5.0") - .withDocumentation("Graphite host to connect to"); - - public static final ConfigProperty GRAPHITE_SERVER_PORT_NUM = ConfigProperty - .key(GRAPHITE_PREFIX + ".port") - .defaultValue(4756) - .sinceVersion("0.5.0") - .withDocumentation("Graphite port to connect to"); - - // Jmx - public static final String JMX_PREFIX = METRIC_PREFIX + ".jmx"; - - public static final ConfigProperty JMX_HOST_NAME = ConfigProperty - .key(JMX_PREFIX + ".host") - .defaultValue("localhost") - .sinceVersion("0.5.1") - .withDocumentation("Jmx host to connect to"); - - public static final ConfigProperty JMX_PORT_NUM = ConfigProperty - .key(JMX_PREFIX + ".port") - .defaultValue(9889) - .sinceVersion("0.5.1") - .withDocumentation("Jmx port to connect to"); - - public static final ConfigProperty GRAPHITE_METRIC_PREFIX_VALUE = ConfigProperty - .key(GRAPHITE_PREFIX + ".metric.prefix") - .noDefaultValue() - .sinceVersion("0.5.1") - .withDocumentation("Standard prefix applied to all metrics. 
This helps to add datacenter, environment information for e.g"); - // User defined public static final ConfigProperty METRICS_REPORTER_CLASS_NAME = ConfigProperty .key(METRIC_PREFIX + ".reporter.class") @@ -125,51 +89,6 @@ public class HoodieMetricsConfig extends HoodieConfig { */ @Deprecated public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = METRICS_REPORTER_TYPE_VALUE.defaultValue(); - /** - * @deprecated Use {@link #GRAPHITE_SERVER_HOST_NAME} and its methods instead - */ - @Deprecated - public static final String GRAPHITE_SERVER_HOST = GRAPHITE_SERVER_HOST_NAME.key(); - /** - * @deprecated Use {@link #GRAPHITE_SERVER_HOST_NAME} and its methods instead - */ - @Deprecated - public static final String DEFAULT_GRAPHITE_SERVER_HOST = GRAPHITE_SERVER_HOST_NAME.defaultValue(); - /** - * @deprecated Use {@link #GRAPHITE_SERVER_PORT_NUM} and its methods instead - */ - @Deprecated - public static final String GRAPHITE_SERVER_PORT = GRAPHITE_SERVER_PORT_NUM.key(); - /** - * @deprecated Use {@link #GRAPHITE_SERVER_PORT_NUM} and its methods instead - */ - @Deprecated - public static final int DEFAULT_GRAPHITE_SERVER_PORT = GRAPHITE_SERVER_PORT_NUM.defaultValue(); - /** - * @deprecated Use {@link #JMX_HOST_NAME} and its methods instead - */ - @Deprecated - public static final String JMX_HOST = JMX_HOST_NAME.key(); - /** - * @deprecated Use {@link #JMX_HOST_NAME} and its methods instead - */ - @Deprecated - public static final String DEFAULT_JMX_HOST = JMX_HOST_NAME.defaultValue(); - /** - * @deprecated Use {@link #JMX_PORT_NUM} and its methods instead - */ - @Deprecated - public static final String JMX_PORT = JMX_PORT_NUM.key(); - /** - * @deprecated Use {@link #JMX_PORT_NUM} and its methods instead - */ - @Deprecated - public static final int DEFAULT_JMX_PORT = JMX_PORT_NUM.defaultValue(); - /** - * @deprecated Use {@link #GRAPHITE_METRIC_PREFIX_VALUE} and its methods instead - */ - @Deprecated - public static final String GRAPHITE_METRIC_PREFIX = GRAPHITE_METRIC_PREFIX_VALUE.key(); /** * @deprecated Use {@link #METRICS_REPORTER_CLASS_NAME} and its methods instead */ @@ -220,31 +139,6 @@ public Builder withReporterType(String reporterType) { return this; } - public Builder toGraphiteHost(String host) { - hoodieMetricsConfig.setValue(GRAPHITE_SERVER_HOST_NAME, host); - return this; - } - - public Builder onGraphitePort(int port) { - hoodieMetricsConfig.setValue(GRAPHITE_SERVER_PORT_NUM, String.valueOf(port)); - return this; - } - - public Builder toJmxHost(String host) { - hoodieMetricsConfig.setValue(JMX_HOST_NAME, host); - return this; - } - - public Builder onJmxPort(String port) { - hoodieMetricsConfig.setValue(JMX_PORT_NUM, port); - return this; - } - - public Builder usePrefix(String prefix) { - hoodieMetricsConfig.setValue(GRAPHITE_METRIC_PREFIX_VALUE, prefix); - return this; - } - public Builder withReporterClass(String className) { hoodieMetricsConfig.setValue(METRICS_REPORTER_CLASS_NAME, className); return this; @@ -267,6 +161,10 @@ public HoodieMetricsConfig build() { HoodieMetricsPrometheusConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.PROMETHEUS, HoodieMetricsPrometheusConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.JMX, + HoodieMetricsJmxConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + 
hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.GRAPHITE, + HoodieMetricsGraphiteConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); return hoodieMetricsConfig; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsDatadogConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsDatadogConfig.java similarity index 98% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsDatadogConfig.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsDatadogConfig.java index d63cb0fefa309..3fc306b8cdd3b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsDatadogConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsDatadogConfig.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.config; +package org.apache.hudi.config.metrics; import org.apache.hudi.common.config.ConfigClassProperty; import org.apache.hudi.common.config.ConfigGroups; @@ -27,7 +27,7 @@ import java.util.Properties; -import static org.apache.hudi.config.HoodieMetricsConfig.METRIC_PREFIX; +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; /** * Configs for Datadog reporter type. diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsGraphiteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsGraphiteConfig.java new file mode 100644 index 0000000000000..25c4c6af4a4c2 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsGraphiteConfig.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config.metrics; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; + +/** + * Configs for Graphite reporter type. + *

+ * {@link org.apache.hudi.metrics.MetricsReporterType#GRAPHITE} + */ +@ConfigClassProperty(name = "Metrics Configurations for Graphite", + groupName = ConfigGroups.Names.METRICS, + description = "Enables reporting on Hudi metrics using Graphite. " + + " Hudi publishes metrics on every commit, clean, rollback etc.") +public class HoodieMetricsGraphiteConfig extends HoodieConfig { + + public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; + + public static final ConfigProperty GRAPHITE_SERVER_HOST_NAME = ConfigProperty + .key(GRAPHITE_PREFIX + ".host") + .defaultValue("localhost") + .sinceVersion("0.5.0") + .withDocumentation("Graphite host to connect to."); + + public static final ConfigProperty GRAPHITE_SERVER_PORT_NUM = ConfigProperty + .key(GRAPHITE_PREFIX + ".port") + .defaultValue(4756) + .sinceVersion("0.5.0") + .withDocumentation("Graphite port to connect to."); + + public static final ConfigProperty GRAPHITE_METRIC_PREFIX_VALUE = ConfigProperty + .key(GRAPHITE_PREFIX + ".metric.prefix") + .noDefaultValue() + .sinceVersion("0.5.1") + .withDocumentation("Standard prefix applied to all metrics. This helps to add datacenter, environment information for e.g"); + + public static final ConfigProperty GRAPHITE_REPORT_PERIOD_IN_SECONDS = ConfigProperty + .key(GRAPHITE_PREFIX + ".report.period.seconds") + .defaultValue(30) + .sinceVersion("0.10.0") + .withDocumentation("Graphite reporting period in seconds. Default to 30."); + + /** + * @deprecated Use {@link #GRAPHITE_SERVER_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String GRAPHITE_SERVER_HOST = GRAPHITE_SERVER_HOST_NAME.key(); + /** + * @deprecated Use {@link #GRAPHITE_SERVER_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_GRAPHITE_SERVER_HOST = GRAPHITE_SERVER_HOST_NAME.defaultValue(); + /** + * @deprecated Use {@link #GRAPHITE_SERVER_PORT_NUM} and its methods instead + */ + @Deprecated + public static final String GRAPHITE_SERVER_PORT = GRAPHITE_SERVER_PORT_NUM.key(); + /** + * @deprecated Use {@link #GRAPHITE_SERVER_PORT_NUM} and its methods instead + */ + @Deprecated + public static final int DEFAULT_GRAPHITE_SERVER_PORT = GRAPHITE_SERVER_PORT_NUM.defaultValue(); + /** + * @deprecated Use {@link #GRAPHITE_METRIC_PREFIX_VALUE} and its methods instead + */ + @Deprecated + public static final String GRAPHITE_METRIC_PREFIX = GRAPHITE_METRIC_PREFIX_VALUE.key(); + + private HoodieMetricsGraphiteConfig() { + super(); + } + + public static HoodieMetricsGraphiteConfig.Builder newBuilder() { + return new HoodieMetricsGraphiteConfig.Builder(); + } + + public static class Builder { + + private final HoodieMetricsGraphiteConfig hoodieMetricsGraphiteConfig = new HoodieMetricsGraphiteConfig(); + + public HoodieMetricsGraphiteConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.hoodieMetricsGraphiteConfig.getProps().load(reader); + return this; + } + } + + public HoodieMetricsGraphiteConfig.Builder fromProperties(Properties props) { + this.hoodieMetricsGraphiteConfig.getProps().putAll(props); + return this; + } + + public HoodieMetricsGraphiteConfig.Builder toGraphiteHost(String host) { + hoodieMetricsGraphiteConfig.setValue(GRAPHITE_SERVER_HOST_NAME, host); + return this; + } + + public HoodieMetricsGraphiteConfig.Builder onGraphitePort(int port) { + hoodieMetricsGraphiteConfig.setValue(GRAPHITE_SERVER_PORT_NUM, String.valueOf(port)); + return this; + } + + public 
HoodieMetricsGraphiteConfig.Builder usePrefix(String prefix) { + hoodieMetricsGraphiteConfig.setValue(GRAPHITE_METRIC_PREFIX_VALUE, prefix); + return this; + } + + public HoodieMetricsGraphiteConfig.Builder periodSeconds(String periodSeconds) { + hoodieMetricsGraphiteConfig.setValue(GRAPHITE_REPORT_PERIOD_IN_SECONDS, periodSeconds); + return this; + } + + public HoodieMetricsGraphiteConfig build() { + hoodieMetricsGraphiteConfig.setDefaults(HoodieMetricsGraphiteConfig.class.getName()); + return hoodieMetricsGraphiteConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsJmxConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsJmxConfig.java new file mode 100644 index 0000000000000..e3a57a1c5caf4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsJmxConfig.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config.metrics; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; + +/** + * Configs for Jmx reporter type. + *

+ * {@link org.apache.hudi.metrics.MetricsReporterType#JMX} + */ +@ConfigClassProperty(name = "Metrics Configurations for Jmx", + groupName = ConfigGroups.Names.METRICS, + description = "Enables reporting on Hudi metrics using Jmx. " + + " Hudi publishes metrics on every commit, clean, rollback etc.") +public class HoodieMetricsJmxConfig extends HoodieConfig { + + public static final String JMX_PREFIX = METRIC_PREFIX + ".jmx"; + + public static final ConfigProperty JMX_HOST_NAME = ConfigProperty + .key(JMX_PREFIX + ".host") + .defaultValue("localhost") + .sinceVersion("0.5.1") + .withDocumentation("Jmx host to connect to"); + + public static final ConfigProperty JMX_PORT_NUM = ConfigProperty + .key(JMX_PREFIX + ".port") + .defaultValue(9889) + .sinceVersion("0.5.1") + .withDocumentation("Jmx port to connect to"); + + /** + * @deprecated Use {@link #JMX_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String JMX_HOST = JMX_HOST_NAME.key(); + /** + * @deprecated Use {@link #JMX_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_JMX_HOST = JMX_HOST_NAME.defaultValue(); + /** + * @deprecated Use {@link #JMX_PORT_NUM} and its methods instead + */ + @Deprecated + public static final String JMX_PORT = JMX_PORT_NUM.key(); + /** + * @deprecated Use {@link #JMX_PORT_NUM} and its methods instead + */ + @Deprecated + public static final int DEFAULT_JMX_PORT = JMX_PORT_NUM.defaultValue(); + + private HoodieMetricsJmxConfig() { + super(); + } + + public static HoodieMetricsJmxConfig.Builder newBuilder() { + return new HoodieMetricsJmxConfig.Builder(); + } + + public static class Builder { + + private final HoodieMetricsJmxConfig hoodieMetricsJmxConfig = new HoodieMetricsJmxConfig(); + + public HoodieMetricsJmxConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.hoodieMetricsJmxConfig.getProps().load(reader); + return this; + } + } + + public HoodieMetricsJmxConfig.Builder fromProperties(Properties props) { + this.hoodieMetricsJmxConfig.getProps().putAll(props); + return this; + } + + public HoodieMetricsJmxConfig.Builder toJmxHost(String host) { + hoodieMetricsJmxConfig.setValue(JMX_HOST_NAME, host); + return this; + } + + public HoodieMetricsJmxConfig.Builder onJmxPort(String port) { + hoodieMetricsJmxConfig.setValue(JMX_PORT_NUM, port); + return this; + } + + public HoodieMetricsJmxConfig build() { + hoodieMetricsJmxConfig.setDefaults(HoodieMetricsJmxConfig.class.getName()); + return hoodieMetricsJmxConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsPrometheusConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsPrometheusConfig.java similarity index 78% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsPrometheusConfig.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsPrometheusConfig.java index 9ec2ec62e460e..c04e8aa1e980e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsPrometheusConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsPrometheusConfig.java @@ -16,7 +16,7 @@ * limitations under the License. 
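With the Graphite and Jmx settings split out of HoodieMetricsConfig into the classes above, enabling a reporter now means building the reporter-specific config and passing it to the write config builder. The sketch below is hedged: HoodieMetricsConfig.newBuilder(), its on(...) toggle, withMetricsConfig and withPath are assumed from Hudi's existing API, while withReporterType, HoodieMetricsGraphiteConfig.newBuilder(), its builder methods and withMetricsGraphiteConfig are the ones visible in this diff:

```java
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.config.metrics.HoodieMetricsConfig;
import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig;

public class GraphiteMetricsSketch {

  public static HoodieWriteConfig buildWriteConfig(String basePath) {
    // Reporter-specific settings now live in their own class under org.apache.hudi.config.metrics.
    HoodieMetricsGraphiteConfig graphiteConfig = HoodieMetricsGraphiteConfig.newBuilder()
        .toGraphiteHost("graphite.internal")   // hoodie.metrics.graphite.host (hypothetical host)
        .onGraphitePort(4756)                  // hoodie.metrics.graphite.port
        .usePrefix("prod.datalake")            // hoodie.metrics.graphite.metric.prefix
        .periodSeconds("30")                   // hoodie.metrics.graphite.report.period.seconds (new in 0.10.0)
        .build();

    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        // on(true)/withMetricsConfig are assumed from the pre-existing metrics API.
        .withMetricsConfig(HoodieMetricsConfig.newBuilder().on(true).withReporterType("GRAPHITE").build())
        .withMetricsGraphiteConfig(graphiteConfig)
        .build();
  }
}
```

A Jmx setup would follow the same shape, building a HoodieMetricsJmxConfig via toJmxHost/onJmxPort and passing it through withMetricsJmxConfig.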
*/ -package org.apache.hudi.config; +package org.apache.hudi.config.metrics; import org.apache.hudi.common.config.ConfigClassProperty; import org.apache.hudi.common.config.ConfigGroups; @@ -25,8 +25,14 @@ import java.util.Properties; -import static org.apache.hudi.config.HoodieMetricsConfig.METRIC_PREFIX; +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; +/** + * Configs for Prometheus/Pushgateway reporter type. + *

+ * {@link org.apache.hudi.metrics.MetricsReporterType#PROMETHEUS} + * {@link org.apache.hudi.metrics.MetricsReporterType#PROMETHEUS_PUSHGATEWAY} + */ @ConfigClassProperty(name = "Metrics Configurations for Prometheus", groupName = ConfigGroups.Names.METRICS, description = "Enables reporting on Hudi metrics using Prometheus. " @@ -174,6 +180,41 @@ public Builder fromProperties(Properties props) { return this; } + public HoodieMetricsPrometheusConfig.Builder withPushgatewayHostName(String hostName) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_HOST_NAME, String.valueOf(hostName)); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayPortNum(Integer pushgatewayPortNum) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_PORT_NUM, String.valueOf(pushgatewayPortNum)); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayReportPeriodInSeconds(String periodTime) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS, periodTime); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayDeleteOnShutdownEnable(boolean deleteOnShutdownEnable) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE, String.valueOf(deleteOnShutdownEnable)); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayJobname(String jobname) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_JOBNAME, jobname); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayRandomJobnameSuffix(boolean randomJobnameSuffix) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX, String.valueOf(randomJobnameSuffix)); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPrometheusPortNum(int prometheusPortNum) { + hoodieMetricsPrometheusConfig.setValue(PROMETHEUS_PORT_NUM, String.valueOf(prometheusPortNum)); + return this; + } + public HoodieMetricsPrometheusConfig build() { hoodieMetricsPrometheusConfig.setDefaults(HoodieMetricsPrometheusConfig.class.getName()); return hoodieMetricsPrometheusConfig; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java index 2e1915ff20431..974f4d546b100 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java @@ -21,12 +21,16 @@ import org.apache.hudi.ApiMaturityLevel; import org.apache.hudi.PublicAPIClass; import org.apache.hudi.PublicAPIMethod; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.table.HoodieTable; import java.io.Serializable; @@ -35,9 +39,9 @@ * Base class for different types of indexes to determine the mapping from uuid. 
* * @param Sub type of HoodieRecordPayload - * @param Type of inputs - * @param Type of keys - * @param Type of outputs + * @param Type of inputs for deprecated APIs + * @param Type of keys for deprecated APIs + * @param Type of outputs for deprecated APIs */ @PublicAPIClass(maturity = ApiMaturityLevel.EVOLVING) public abstract class HoodieIndex implements Serializable { @@ -52,18 +56,39 @@ protected HoodieIndex(HoodieWriteConfig config) { * Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually * present). */ - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) - public abstract I tagLocation(I records, HoodieEngineContext context, - HoodieTable hoodieTable) throws HoodieIndexException; + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) + public I tagLocation(I records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + throw new HoodieNotSupportedException("Deprecated API should not be called"); + } /** * Extracts the location of written records, and updates the index. - *

- * TODO(vc): We may need to propagate the record as well in a WriteStatus class */ - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) - public abstract O updateLocation(O writeStatuses, HoodieEngineContext context, - HoodieTable hoodieTable) throws HoodieIndexException; + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) + public O updateLocation(O writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + throw new HoodieNotSupportedException("Deprecated API should not be called"); + } + + /** + * Looks up the index and tags each incoming record with a location of a file that contains + * the row (if it is actually present). + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public abstract HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException; + + /** + * Extracts the location of written records, and updates the index. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public abstract HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException; /** * Rollback the effects of the commit made at instantTime. diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java new file mode 100644 index 0000000000000..9f0e815632f38 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import java.io.Serializable; +import java.util.List; +import java.util.Map; + +/** + * Helper for {@link HoodieBloomIndex} containing engine-specific logic. + */ +public abstract class BaseHoodieBloomIndexHelper implements Serializable { + /** + * Find out pair. + * + * @param config Write config. + * @param context {@link HoodieEngineContext} instance to use. + * @param hoodieTable {@link HoodieTable} instance to use. + * @param partitionRecordKeyPairs Pairs of partition path and record key. 
+ * @param fileComparisonPairs Pairs of filename and record key based on file comparisons. + * @param partitionToFileInfo Partition path to {@link BloomIndexFileInfo} map. + * @param recordsPerPartition Number of records per partition in a map. + * @return {@link HoodiePairData} of {@link HoodieKey} and {@link HoodieRecordLocation} pairs. + */ + public abstract HoodiePairData findMatchingFilesForRecordKeys( + HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, + HoodiePairData partitionRecordKeyPairs, + HoodieData> fileComparisonPairs, + Map> partitionToFileInfo, + Map recordsPerPartition); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndex.java deleted file mode 100644 index 75ab693d132e2..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndex.java +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.index.bloom; - -import com.beust.jcommander.internal.Lists; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.MetadataNotFoundException; -import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.io.HoodieKeyLookupHandle; -import org.apache.hudi.io.HoodieRangeInfoHandle; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import static java.util.stream.Collectors.groupingBy; -import static java.util.stream.Collectors.mapping; -import static java.util.stream.Collectors.toList; -import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; - -@SuppressWarnings("checkstyle:LineLength") -public class HoodieBaseBloomIndex extends HoodieIndex>, List, List> { - - private static final Logger LOG = LogManager.getLogger(HoodieBaseBloomIndex.class); - - public HoodieBaseBloomIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public List> tagLocation(List> records, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) { - // Step 1: Extract out thinner Map of (partitionPath, recordKey) - Map> partitionRecordKeyMap = new HashMap<>(); - records.forEach(record -> { - if (partitionRecordKeyMap.containsKey(record.getPartitionPath())) { - partitionRecordKeyMap.get(record.getPartitionPath()).add(record.getRecordKey()); - } else { - List recordKeys = Lists.newArrayList(); - recordKeys.add(record.getRecordKey()); - partitionRecordKeyMap.put(record.getPartitionPath(), recordKeys); - } - }); - - // Step 2: Lookup indexes for all the partition/recordkey pair - Map keyFilenamePairMap = - lookupIndex(partitionRecordKeyMap, context, hoodieTable); - - if (LOG.isDebugEnabled()) { - long totalTaggedRecords = keyFilenamePairMap.values().size(); - LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); - } - - // Step 3: Tag the incoming records, as inserts or updates, by joining with existing record keys - List> taggedRecords = tagLocationBacktoRecords(keyFilenamePairMap, records); - - return taggedRecords; - } - - /** - * Lookup the location for each record key and return the pair for all record keys already - * present and drop the record keys if not present. 
- */ - private Map lookupIndex( - Map> partitionRecordKeyMap, final HoodieEngineContext context, - final HoodieTable hoodieTable) { - // Obtain records per partition, in the incoming records - Map recordsPerPartition = new HashMap<>(); - partitionRecordKeyMap.keySet().forEach(k -> recordsPerPartition.put(k, Long.valueOf(partitionRecordKeyMap.get(k).size()))); - List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); - - // Step 2: Load all involved files as pairs - List> fileInfoList = - loadInvolvedFiles(affectedPartitionPathList, context, hoodieTable); - final Map> partitionToFileInfo = - fileInfoList.stream().collect(groupingBy(Pair::getLeft, mapping(Pair::getRight, toList()))); - - // Step 3: Obtain a List, for each incoming record, that already exists, with the file id, - // that contains it. - List> fileComparisons = - explodeRecordsWithFileComparisons(partitionToFileInfo, partitionRecordKeyMap); - return findMatchingFilesForRecordKeys(fileComparisons, hoodieTable); - } - - /** - * Load all involved files as pair List. - */ - //TODO duplicate code with spark, we can optimize this method later - List> loadInvolvedFiles(List partitions, final HoodieEngineContext context, - final HoodieTable hoodieTable) { - // Obtain the latest data files from all the partitions. - List> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream() - .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) - .collect(toList()); - - if (config.getBloomIndexPruneByRanges()) { - // also obtain file ranges, if range pruning is enabled - context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)"); - return context.map(partitionPathFileIDList, pf -> { - try { - HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); - String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); - return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); - } catch (MetadataNotFoundException me) { - LOG.warn("Unable to find range metadata in file :" + pf); - return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); - } - }, Math.max(partitionPathFileIDList.size(), 1)); - } else { - return partitionPathFileIDList.stream() - .map(pf -> Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); - } - } - - @Override - public boolean rollbackCommit(String instantTime) { - // Nope, don't need to do anything. - return true; - } - - /** - * This is not global, since we depend on the partitionPath to do the lookup. - */ - @Override - public boolean isGlobal() { - return false; - } - - /** - * No indexes into log files yet. - */ - @Override - public boolean canIndexLogFiles() { - return false; - } - - /** - * Bloom filters are stored, into the same data files. - */ - @Override - public boolean isImplicitWithStorage() { - return true; - } - - /** - * For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be - * checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files - * to be compared gets cut down a lot from range pruning. - *

- * Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on - * recordKey ranges in the index info. - */ - List> explodeRecordsWithFileComparisons( - final Map> partitionToFileIndexInfo, - Map> partitionRecordKeyMap) { - IndexFileFilter indexFileFilter = - config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo) - : new ListBasedIndexFileFilter(partitionToFileIndexInfo); - - List> fileRecordPairs = new ArrayList<>(); - partitionRecordKeyMap.keySet().forEach(partitionPath -> { - List hoodieRecordKeys = partitionRecordKeyMap.get(partitionPath); - hoodieRecordKeys.forEach(hoodieRecordKey -> { - indexFileFilter.getMatchingFilesAndPartition(partitionPath, hoodieRecordKey).forEach(partitionFileIdPair -> { - fileRecordPairs.add(Pair.of(partitionFileIdPair.getRight(), - new HoodieKey(hoodieRecordKey, partitionPath))); - }); - }); - }); - return fileRecordPairs; - } - - /** - * Find out pair. - */ - Map findMatchingFilesForRecordKeys( - List> fileComparisons, - HoodieTable hoodieTable) { - - fileComparisons = fileComparisons.stream().sorted((o1, o2) -> o1.getLeft().compareTo(o2.getLeft())).collect(toList()); - - List keyLookupResults = new ArrayList<>(); - - Iterator> iterator = new HoodieBaseBloomIndexCheckFunction(hoodieTable, config).apply(fileComparisons.iterator()); - while (iterator.hasNext()) { - keyLookupResults.addAll(iterator.next()); - } - - Map hoodieRecordLocationMap = new HashMap<>(); - - keyLookupResults = keyLookupResults.stream().filter(lr -> lr.getMatchingRecordKeys().size() > 0).collect(toList()); - keyLookupResults.forEach(lookupResult -> { - lookupResult.getMatchingRecordKeys().forEach(r -> { - hoodieRecordLocationMap.put(new HoodieKey(r, lookupResult.getPartitionPath()), new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())); - }); - }); - - return hoodieRecordLocationMap; - } - - - /** - * Tag the back to the original HoodieRecord List. - */ - protected List> tagLocationBacktoRecords( - Map keyFilenamePair, List> records) { - Map> keyRecordPairMap = new HashMap<>(); - records.forEach(r -> keyRecordPairMap.put(r.getKey(), r)); - // Here as the record might have more data than rowKey (some rowKeys' fileId is null), - // so we do left outer join. - List, HoodieRecordLocation>> newList = new ArrayList<>(); - keyRecordPairMap.keySet().forEach(k -> { - if (keyFilenamePair.containsKey(k)) { - newList.add(Pair.of(keyRecordPairMap.get(k), keyFilenamePair.get(k))); - } else { - newList.add(Pair.of(keyRecordPairMap.get(k), null)); - } - }); - List> res = Lists.newArrayList(); - for (Pair, HoodieRecordLocation> v : newList) { - res.add(HoodieIndexUtils.getTaggedRecord(v.getLeft(), Option.ofNullable(v.getRight()))); - } - return res; - } - - @Override - public List updateLocation(List writeStatusList, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) { - return writeStatusList; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java new file mode 100644 index 0000000000000..a223d695cc03a --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.MetadataNotFoundException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.io.HoodieRangeInfoHandle; +import org.apache.hudi.table.HoodieTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static java.util.stream.Collectors.groupingBy; +import static java.util.stream.Collectors.mapping; +import static java.util.stream.Collectors.toList; +import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; + +/** + * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata. + */ +public class HoodieBloomIndex> + extends HoodieIndex { + private static final Logger LOG = LogManager.getLogger(HoodieBloomIndex.class); + + private final BaseHoodieBloomIndexHelper bloomIndexHelper; + + public HoodieBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelper bloomIndexHelper) { + super(config); + this.bloomIndexHelper = bloomIndexHelper; + } + + @Override + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + // Step 0: cache the input records if needed + if (config.getBloomIndexUseCaching()) { + records.persist(new HoodieConfig(config.getProps()) + .getString(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE)); + } + + // Step 1: Extract out thinner pairs of (partitionPath, recordKey) + HoodiePairData partitionRecordKeyPairs = records.mapToPair( + record -> new ImmutablePair<>(record.getPartitionPath(), record.getRecordKey())); + + // Step 2: Lookup indexes for all the partition/recordkey pair + HoodiePairData keyFilenamePairs = + lookupIndex(partitionRecordKeyPairs, context, hoodieTable); + + // Cache the result, for subsequent stages. 
+ if (config.getBloomIndexUseCaching()) { + keyFilenamePairs.persist("MEMORY_AND_DISK_SER"); + } + if (LOG.isDebugEnabled()) { + long totalTaggedRecords = keyFilenamePairs.count(); + LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); + } + + // Step 3: Tag the incoming records, as inserts or updates, by joining with existing record keys + HoodieData> taggedRecords = tagLocationBacktoRecords(keyFilenamePairs, records); + + if (config.getBloomIndexUseCaching()) { + records.unpersist(); + keyFilenamePairs.unpersist(); + } + + return taggedRecords; + } + + /** + * Lookup the location for each record key and return the pair for all record keys already + * present and drop the record keys if not present. + */ + private HoodiePairData lookupIndex( + HoodiePairData partitionRecordKeyPairs, final HoodieEngineContext context, + final HoodieTable hoodieTable) { + // Obtain records per partition, in the incoming records + Map recordsPerPartition = partitionRecordKeyPairs.countByKey(); + List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); + + // Step 2: Load all involved files as pairs + List> fileInfoList = + loadInvolvedFiles(affectedPartitionPathList, context, hoodieTable); + final Map> partitionToFileInfo = + fileInfoList.stream().collect(groupingBy(Pair::getLeft, mapping(Pair::getRight, toList()))); + + // Step 3: Obtain a HoodieData, for each incoming record, that already exists, with the file id, + // that contains it. + HoodieData> fileComparisonPairs = + explodeRecordsWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairs); + + return bloomIndexHelper.findMatchingFilesForRecordKeys(config, context, hoodieTable, + partitionRecordKeyPairs, fileComparisonPairs, partitionToFileInfo, recordsPerPartition); + } + + /** + * Load all involved files as pair List. + */ + List> loadInvolvedFiles( + List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { + // Obtain the latest data files from all the partitions. + List> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream() + .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) + .collect(toList()); + + if (config.getBloomIndexPruneByRanges()) { + // also obtain file ranges, if range pruning is enabled + context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)"); + return context.map(partitionPathFileIDList, pf -> { + try { + HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); + String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); + return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); + } catch (MetadataNotFoundException me) { + LOG.warn("Unable to find range metadata in file :" + pf); + return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); + } + }, Math.max(partitionPathFileIDList.size(), 1)); + } else { + return partitionPathFileIDList.stream() + .map(pf -> Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); + } + } + + @Override + public boolean rollbackCommit(String instantTime) { + // Nope, don't need to do anything. + return true; + } + + /** + * This is not global, since we depend on the partitionPath to do the lookup. + */ + @Override + public boolean isGlobal() { + return false; + } + + /** + * No indexes into log files yet. 
+ */ + @Override + public boolean canIndexLogFiles() { + return false; + } + + /** + * Bloom filters are stored, into the same data files. + */ + @Override + public boolean isImplicitWithStorage() { + return true; + } + + /** + * For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be + * checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files + * to be compared gets cut down a lot from range pruning. + *

+ * Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on + * recordKey ranges in the index info. + */ + HoodieData> explodeRecordsWithFileComparisons( + final Map> partitionToFileIndexInfo, + HoodiePairData partitionRecordKeyPairs) { + IndexFileFilter indexFileFilter = + config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo) + : new ListBasedIndexFileFilter(partitionToFileIndexInfo); + + return partitionRecordKeyPairs.map(partitionRecordKeyPair -> { + String recordKey = partitionRecordKeyPair.getRight(); + String partitionPath = partitionRecordKeyPair.getLeft(); + + return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() + .map(partitionFileIdPair -> new ImmutablePair<>(partitionFileIdPair.getRight(), + new HoodieKey(recordKey, partitionPath))) + .collect(Collectors.toList()); + }).flatMap(List::iterator); + } + + /** + * Tag the back to the original HoodieRecord List. + */ + protected HoodieData> tagLocationBacktoRecords( + HoodiePairData keyFilenamePair, + HoodieData> records) { + HoodiePairData> keyRecordPairs = + records.mapToPair(record -> new ImmutablePair<>(record.getKey(), record)); + // Here as the records might have more data than keyFilenamePairs (some row keys' fileId is null), + // so we do left outer join. + return keyRecordPairs.leftOuterJoin(keyFilenamePair).values() + .map(v -> HoodieIndexUtils.getTaggedRecord(v.getLeft(), Option.ofNullable(v.getRight().orElse(null)))); + } + + @Override + public HoodieData updateLocation( + HoodieData writeStatusData, HoodieEngineContext context, + HoodieTable hoodieTable) { + return writeStatusData; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieGlobalBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java similarity index 57% rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieGlobalBloomIndex.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java index 21b9d40601a78..39fa72a329fe3 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieGlobalBloomIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java @@ -7,17 +7,20 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ package org.apache.hudi.index.bloom; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; @@ -27,38 +30,33 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndexUtils; import org.apache.hudi.table.HoodieTable; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.Optional; - import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import scala.Tuple2; - /** - * This filter will only work with hoodie table since it will only load partitions with .hoodie_partition_metadata - * file in it. + * This filter will only work with hoodie table since it will only load partitions + * with .hoodie_partition_metadata file in it. */ -public class SparkHoodieGlobalBloomIndex extends SparkHoodieBloomIndex { - - public SparkHoodieGlobalBloomIndex(HoodieWriteConfig config) { - super(config); +public class HoodieGlobalBloomIndex> extends HoodieBloomIndex { + public HoodieGlobalBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelper bloomIndexHelper) { + super(config, bloomIndexHelper); } /** - * Load all involved files as pair RDD from all partitions in the table. + * Load all involved files as pairs from all partitions in the table. */ @Override - List> loadInvolvedFiles(List partitions, final HoodieEngineContext context, - final HoodieTable hoodieTable) { + List> loadInvolvedFiles(List partitions, final HoodieEngineContext context, + final HoodieTable hoodieTable) { HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); List allPartitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath()); return super.loadInvolvedFiles(allPartitionPaths, context, hoodieTable); @@ -70,25 +68,25 @@ List> loadInvolvedFiles(List partitio * to be compared gets cut down a lot from range pruning. *

* Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on - * recordKey ranges in the index info. the partition path of the incoming record (partitionRecordKeyPairRDD._2()) will + * recordKey ranges in the index info. the partition path of the incoming record (partitionRecordKeyPairs._2()) will * be ignored since the search scope should be bigger than that */ @Override - JavaRDD> explodeRecordRDDWithFileComparisons( + HoodieData> explodeRecordsWithFileComparisons( final Map> partitionToFileIndexInfo, - JavaPairRDD partitionRecordKeyPairRDD) { + HoodiePairData partitionRecordKeyPairs) { IndexFileFilter indexFileFilter = config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo) : new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo); - return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> { - String recordKey = partitionRecordKeyPair._2(); - String partitionPath = partitionRecordKeyPair._1(); + return partitionRecordKeyPairs.map(partitionRecordKeyPair -> { + String recordKey = partitionRecordKeyPair.getRight(); + String partitionPath = partitionRecordKeyPair.getLeft(); return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() - .map(partitionFileIdPair -> new Tuple2<>(partitionFileIdPair.getRight(), + .map(partitionFileIdPair -> new ImmutablePair<>(partitionFileIdPair.getRight(), new HoodieKey(recordKey, partitionFileIdPair.getLeft()))) .collect(Collectors.toList()); }).flatMap(List::iterator); @@ -98,27 +96,29 @@ JavaRDD> explodeRecordRDDWithFileComparisons( * Tagging for global index should only consider the record key. */ @Override - protected JavaRDD> tagLocationBacktoRecords( - JavaPairRDD keyLocationPairRDD, JavaRDD> recordRDD) { + protected HoodieData> tagLocationBacktoRecords( + HoodiePairData keyLocationPairs, + HoodieData> records) { - JavaPairRDD> incomingRowKeyRecordPairRDD = - recordRDD.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); + HoodiePairData> incomingRowKeyRecordPairs = + records.mapToPair(record -> new ImmutablePair<>(record.getRecordKey(), record)); - JavaPairRDD> existingRecordKeyToRecordLocationHoodieKeyMap = - keyLocationPairRDD.mapToPair(p -> new Tuple2<>(p._1.getRecordKey(), new Tuple2<>(p._2, p._1))); + HoodiePairData> existingRecordKeyToRecordLocationHoodieKeyMap = + keyLocationPairs.mapToPair(p -> new ImmutablePair<>( + p.getKey().getRecordKey(), new ImmutablePair<>(p.getValue(), p.getKey()))); - // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join. - return incomingRowKeyRecordPairRDD.leftOuterJoin(existingRecordKeyToRecordLocationHoodieKeyMap).values().flatMap(record -> { - final HoodieRecord hoodieRecord = record._1; - final Optional> recordLocationHoodieKeyPair = record._2; + // Here as the records might have more data than rowKeys (some rowKeys' fileId is null), so we do left outer join. 
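The left outer join described above drives the tagging decision for the global index: a record key with no existing location becomes a plain insert; a matching key in the same partition becomes an in-place update; and a matching key in a different partition either updates the old partition or, when the update-partition-path option is on, turns into a delete in the old partition plus an insert into the new one. A minimal sketch of that decision, using plain maps and a made-up flag in place of the actual config key:

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;

// Illustrative sketch only: left-outer-join tagging semantics of the global bloom index.
public class GlobalTaggingSketch {
  public static void main(String[] args) {
    // Incoming records keyed by record key (record key -> incoming partition path).
    Map<String, String> incoming = new LinkedHashMap<>();
    incoming.put("k1", "2021/07/02");   // key exists, but in a different partition
    incoming.put("k9", "2021/07/02");   // brand new key

    // Existing index state (record key -> partition path the key currently lives in).
    Map<String, String> existing = Collections.singletonMap("k1", "2021/07/01");

    boolean updatePartitionPath = true; // stand-in for the update-partition-path setting
    incoming.forEach((key, newPartition) -> {
      String oldPartition = existing.get(key);     // left outer join: may be null
      if (oldPartition == null) {
        System.out.println(key + ": plain INSERT into " + newPartition);
      } else if (updatePartitionPath && !oldPartition.equals(newPartition)) {
        System.out.println(key + ": DELETE from " + oldPartition + ", INSERT into " + newPartition);
      } else {
        System.out.println(key + ": UPDATE in place at " + oldPartition);
      }
    });
  }
}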
+ return incomingRowKeyRecordPairs.leftOuterJoin(existingRecordKeyToRecordLocationHoodieKeyMap).values().flatMap(record -> { + final HoodieRecord hoodieRecord = record.getLeft(); + final Option> recordLocationHoodieKeyPair = record.getRight(); if (recordLocationHoodieKeyPair.isPresent()) { // Record key matched to file if (config.getBloomIndexUpdatePartitionPath() - && !recordLocationHoodieKeyPair.get()._2.getPartitionPath().equals(hoodieRecord.getPartitionPath())) { + && !recordLocationHoodieKeyPair.get().getRight().getPartitionPath().equals(hoodieRecord.getPartitionPath())) { // Create an empty record to delete the record in the old partition - HoodieRecord deleteRecord = new HoodieRecord(recordLocationHoodieKeyPair.get()._2, + HoodieRecord deleteRecord = new HoodieRecord(recordLocationHoodieKeyPair.get().getRight(), new EmptyHoodieRecordPayload()); - deleteRecord.setCurrentLocation(recordLocationHoodieKeyPair.get()._1()); + deleteRecord.setCurrentLocation(recordLocationHoodieKeyPair.get().getLeft()); deleteRecord.seal(); // Tag the incoming record for inserting to the new partition HoodieRecord insertRecord = HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty()); @@ -127,8 +127,8 @@ protected JavaRDD> tagLocationBacktoRecords( // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not. // When it differs, the record will still be updated at its old partition. return Collections.singletonList( - (HoodieRecord) HoodieIndexUtils.getTaggedRecord(new HoodieRecord<>(recordLocationHoodieKeyPair.get()._2, hoodieRecord.getData()), - Option.ofNullable(recordLocationHoodieKeyPair.get()._1))).iterator(); + (HoodieRecord) HoodieIndexUtils.getTaggedRecord(new HoodieRecord<>(recordLocationHoodieKeyPair.get().getRight(), hoodieRecord.getData()), + Option.ofNullable(recordLocationHoodieKeyPair.get().getLeft()))).iterator(); } } else { return Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty())).iterator(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java new file mode 100644 index 0000000000000..74191df523659 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieList; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieKeyLookupHandle; +import org.apache.hudi.table.HoodieTable; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import static java.util.stream.Collectors.toList; + +/** + * Helper for {@link HoodieBloomIndex} containing Java {@link List}-based logic. + */ +public class ListBasedHoodieBloomIndexHelper extends BaseHoodieBloomIndexHelper { + + private static final ListBasedHoodieBloomIndexHelper SINGLETON_INSTANCE = new ListBasedHoodieBloomIndexHelper(); + + protected ListBasedHoodieBloomIndexHelper() { + } + + public static ListBasedHoodieBloomIndexHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodiePairData findMatchingFilesForRecordKeys( + HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, + HoodiePairData partitionRecordKeyPairs, + HoodieData> fileComparisonPairs, + Map> partitionToFileInfo, Map recordsPerPartition) { + List> fileComparisonPairList = + HoodieList.getList(fileComparisonPairs).stream() + .sorted(Comparator.comparing(ImmutablePair::getLeft)).collect(toList()); + + List keyLookupResults = new ArrayList<>(); + + Iterator> iterator = new HoodieBaseBloomIndexCheckFunction( + hoodieTable, config).apply(fileComparisonPairList.iterator()); + while (iterator.hasNext()) { + keyLookupResults.addAll(iterator.next()); + } + + keyLookupResults = keyLookupResults.stream().filter( + lr -> lr.getMatchingRecordKeys().size() > 0).collect(toList()); + return context.parallelize(keyLookupResults).flatMap(lookupResult -> + lookupResult.getMatchingRecordKeys().stream() + .map(recordKey -> new ImmutablePair<>(lookupResult, recordKey)).iterator() + ).mapToPair(pair -> { + HoodieKeyLookupHandle.KeyLookupResult lookupResult = pair.getLeft(); + String recordKey = pair.getRight(); + return new ImmutablePair<>( + new HoodieKey(recordKey, lookupResult.getPartitionPath()), + new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())); + }); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaInMemoryHashIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java similarity index 59% rename from hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaInMemoryHashIndex.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java index 8a7197759aa0a..bec675c102ff5 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaInMemoryHashIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java @@ -7,18 +7,20 @@ * "License"); you may not use this file except in compliance * with the License. 
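The list-based helper above sorts the candidate (fileId, recordKey) comparisons by file so each file's filter and key set are consulted in one contiguous run, then keeps only the keys that are actually present. A minimal sketch of that filtering step, with a per-file key set standing in for the bloom filter plus the exact key check done in the lookup handle:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Illustrative sketch only: check candidate (fileId, recordKey) pairs file by file.
public class ListBasedCheckSketch {
  static class Comparison {
    final String fileId;
    final String recordKey;
    Comparison(String fileId, String recordKey) {
      this.fileId = fileId;
      this.recordKey = recordKey;
    }
  }

  public static void main(String[] args) {
    // Stand-in for each file's bloom filter plus exact record-key check.
    Map<String, Set<String>> keysInFile = new HashMap<>();
    keysInFile.put("file-a", new HashSet<>(Arrays.asList("k1", "k2")));
    keysInFile.put("file-b", new HashSet<>(Arrays.asList("k3")));

    // Candidate comparisons produced by the explode step.
    List<Comparison> comparisons = new ArrayList<>(Arrays.asList(
        new Comparison("file-b", "k3"),
        new Comparison("file-a", "k9"),
        new Comparison("file-a", "k1")));

    // Sort by fileId so each file is handled in one pass, then keep only real matches.
    comparisons.sort(Comparator.comparing((Comparison c) -> c.fileId));
    for (Comparison c : comparisons) {
      if (keysInFile.getOrDefault(c.fileId, new HashSet<>()).contains(c.recordKey)) {
        System.out.println(c.recordKey + " located in " + c.fileId);
      }
    }
  }
}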
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ -package org.apache.hudi.index; +package org.apache.hudi.index.inmemory; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -26,27 +28,27 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import java.util.stream.Collectors; /** * Hoodie Index implementation backed by an in-memory Hash map. *

* ONLY USE FOR LOCAL TESTING */ -@SuppressWarnings("checkstyle:LineLength") -public class JavaInMemoryHashIndex extends JavaHoodieIndex { +public class HoodieInMemoryHashIndex> + extends HoodieIndex { private static ConcurrentMap recordLocationMap; - public JavaInMemoryHashIndex(HoodieWriteConfig config) { + public HoodieInMemoryHashIndex(HoodieWriteConfig config) { super(config); - synchronized (JavaInMemoryHashIndex.class) { + synchronized (HoodieInMemoryHashIndex.class) { if (recordLocationMap == null) { recordLocationMap = new ConcurrentHashMap<>(); } @@ -54,25 +56,29 @@ public JavaInMemoryHashIndex(HoodieWriteConfig config) { } @Override - public List> tagLocation(List> records, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) { - List> taggedRecords = new ArrayList<>(); - records.stream().forEach(record -> { - if (recordLocationMap.containsKey(record.getKey())) { - record.unseal(); - record.setCurrentLocation(recordLocationMap.get(record.getKey())); - record.seal(); + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + return records.mapPartitions(hoodieRecordIterator -> { + List> taggedRecords = new ArrayList<>(); + while (hoodieRecordIterator.hasNext()) { + HoodieRecord record = hoodieRecordIterator.next(); + if (recordLocationMap.containsKey(record.getKey())) { + record.unseal(); + record.setCurrentLocation(recordLocationMap.get(record.getKey())); + record.seal(); + } + taggedRecords.add(record); } - taggedRecords.add(record); - }); - return taggedRecords; + return taggedRecords.iterator(); + }, true); } @Override - public List updateLocation(List writeStatusList, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) { - return writeStatusList.stream().map(writeStatus -> { + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) { + return writeStatuses.map(writeStatus -> { for (HoodieRecord record : writeStatus.getWrittenRecords()) { if (!writeStatus.isErrored(record.getKey())) { HoodieKey key = record.getKey(); @@ -86,7 +92,7 @@ public List updateLocation(List writeStatusList, } } return writeStatus; - }).collect(Collectors.toList()); + }); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieGlobalSimpleIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java similarity index 58% rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieGlobalSimpleIndex.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java index 63e13c463ab44..8935fcb02fec2 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieGlobalSimpleIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java @@ -7,18 +7,20 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ package org.apache.hudi.index.simple; -import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; @@ -29,13 +31,12 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.hudi.table.HoodieTable; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import scala.Tuple2; import java.util.Arrays; import java.util.Collections; @@ -49,57 +50,57 @@ * * @param */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkHoodieGlobalSimpleIndex extends SparkHoodieSimpleIndex { - - public SparkHoodieGlobalSimpleIndex(HoodieWriteConfig config) { - super(config); +public class HoodieGlobalSimpleIndex> extends HoodieSimpleIndex { + public HoodieGlobalSimpleIndex(HoodieWriteConfig config, Option keyGeneratorOpt) { + super(config, keyGeneratorOpt); } @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return tagLocationInternal(recordRDD, context, hoodieTable); + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + return tagLocationInternal(records, context, hoodieTable); } /** * Tags records location for incoming records. 
* - * @param inputRecordRDD {@link JavaRDD} of incoming records - * @param context instance of {@link HoodieEngineContext} to use - * @param hoodieTable instance of {@link HoodieTable} to use - * @return {@link JavaRDD} of records with record locations set + * @param inputRecords {@link HoodieData} of incoming records + * @param context instance of {@link HoodieEngineContext} to use + * @param hoodieTable instance of {@link HoodieTable} to use + * @return {@link HoodieData} of records with record locations set */ @Override - protected JavaRDD> tagLocationInternal(JavaRDD> inputRecordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { + protected HoodieData> tagLocationInternal( + HoodieData> inputRecords, HoodieEngineContext context, + HoodieTable hoodieTable) { - JavaPairRDD> keyedInputRecordRDD = inputRecordRDD.mapToPair(entry -> new Tuple2<>(entry.getRecordKey(), entry)); - JavaPairRDD allRecordLocationsInTable = fetchAllRecordLocations(context, hoodieTable, - config.getGlobalSimpleIndexParallelism()); - return getTaggedRecords(keyedInputRecordRDD, allRecordLocationsInTable); + HoodiePairData> keyedInputRecords = + inputRecords.mapToPair(entry -> new ImmutablePair<>(entry.getRecordKey(), entry)); + HoodiePairData allRecordLocationsInTable = + fetchAllRecordLocations(context, hoodieTable, config.getGlobalSimpleIndexParallelism()); + return getTaggedRecords(keyedInputRecords, allRecordLocationsInTable); } /** * Fetch record locations for passed in {@link HoodieKey}s. * - * @param context instance of {@link HoodieEngineContext} to use + * @param context instance of {@link HoodieEngineContext} to use * @param hoodieTable instance of {@link HoodieTable} of interest * @param parallelism parallelism to use - * @return {@link JavaPairRDD} of {@link HoodieKey} and {@link HoodieRecordLocation} + * @return {@link HoodiePairData} of {@link HoodieKey} and {@link HoodieRecordLocation} */ - protected JavaPairRDD fetchAllRecordLocations(HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable, - int parallelism) { + protected HoodiePairData fetchAllRecordLocations( + HoodieEngineContext context, HoodieTable hoodieTable, int parallelism) { List> latestBaseFiles = getAllBaseFilesInTable(context, hoodieTable); return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles); } /** - * Load all files for all partitions as pair RDD. + * Load all files for all partitions as pair data. */ - protected List> getAllBaseFilesInTable(final HoodieEngineContext context, - final HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { + protected List> getAllBaseFilesInTable( + final HoodieEngineContext context, final HoodieTable hoodieTable) { HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); List allPartitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath()); // Obtain the latest data files from all the partitions. 
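The global variant above loads record locations from base files in every partition of the table, whereas the non-global HoodieSimpleIndex (in the changes that follow) restricts the scan to the partitions touched by the incoming batch. A minimal sketch contrasting the two lookup scopes, with made-up partition values:

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

// Illustrative sketch only: lookup scope of the simple index vs. its global variant.
public class SimpleIndexScopeSketch {
  public static void main(String[] args) {
    List<String> allPartitions = Arrays.asList("2021/07/01", "2021/07/02", "2021/07/03");
    Set<String> partitionsInBatch = new HashSet<>(Collections.singletonList("2021/07/02"));

    // Non-global simple index: read base files only under the affected partitions.
    List<String> simpleScope = allPartitions.stream()
        .filter(partitionsInBatch::contains)
        .collect(Collectors.toList());

    // Global simple index: read base files under every partition of the table.
    List<String> globalScope = allPartitions;

    System.out.println("simple index scans:        " + simpleScope);
    System.out.println("global simple index scans: " + globalScope);
  }
}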
@@ -111,16 +112,20 @@ protected List> getAllBaseFilesInTable(final Hoodie * * @param incomingRecords incoming {@link HoodieRecord}s * @param existingRecords existing records with {@link HoodieRecordLocation}s - * @return {@link JavaRDD} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s + * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s */ - private JavaRDD> getTaggedRecords(JavaPairRDD> incomingRecords, JavaPairRDD existingRecords) { - JavaPairRDD> existingRecordByRecordKey = existingRecords - .mapToPair(entry -> new Tuple2<>(entry._1.getRecordKey(), Pair.of(entry._1.getPartitionPath(), entry._2))); + private HoodieData> getTaggedRecords( + HoodiePairData> incomingRecords, + HoodiePairData existingRecords) { + HoodiePairData> existingRecordByRecordKey = + existingRecords.mapToPair( + entry -> new ImmutablePair<>(entry.getLeft().getRecordKey(), + Pair.of(entry.getLeft().getPartitionPath(), entry.getRight()))); return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values() .flatMap(entry -> { - HoodieRecord inputRecord = entry._1; - Option> partitionPathLocationPair = Option.ofNullable(entry._2.orNull()); + HoodieRecord inputRecord = entry.getLeft(); + Option> partitionPathLocationPair = Option.ofNullable(entry.getRight().orElse(null)); List> taggedRecords; if (partitionPathLocationPair.isPresent()) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java new file mode 100644 index 0000000000000..dfefe5adabfe9 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.index.simple; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.io.HoodieKeyLocationFetchHandle; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.table.HoodieTable; + +import java.util.List; + +import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; + +/** + * A simple index which reads interested fields(record key and partition path) from base files and + * joins with incoming records to find the tagged location. + * + * @param type of {@link HoodieRecordPayload} + */ +public class HoodieSimpleIndex> + extends HoodieIndex { + + private final Option keyGeneratorOpt; + + public HoodieSimpleIndex(HoodieWriteConfig config, Option keyGeneratorOpt) { + super(config); + this.keyGeneratorOpt = keyGeneratorOpt; + } + + @Override + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) { + return writeStatuses; + } + + @Override + public boolean rollbackCommit(String commitTime) { + return true; + } + + @Override + public boolean isGlobal() { + return false; + } + + @Override + public boolean canIndexLogFiles() { + return false; + } + + @Override + public boolean isImplicitWithStorage() { + return true; + } + + @Override + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + return tagLocationInternal(records, context, hoodieTable); + } + + /** + * Tags records location for incoming records. 
+ * + * @param inputRecords {@link HoodieData} of incoming records + * @param context instance of {@link HoodieEngineContext} to use + * @param hoodieTable instance of {@link HoodieTable} to use + * @return {@link HoodieData} of records with record locations set + */ + protected HoodieData> tagLocationInternal( + HoodieData> inputRecords, HoodieEngineContext context, + HoodieTable hoodieTable) { + if (config.getSimpleIndexUseCaching()) { + inputRecords.persist(new HoodieConfig(config.getProps()) + .getString(HoodieIndexConfig.SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE)); + } + + HoodiePairData> keyedInputRecords = + inputRecords.mapToPair(record -> new ImmutablePair<>(record.getKey(), record)); + HoodiePairData existingLocationsOnTable = + fetchRecordLocationsForAffectedPartitions(keyedInputRecords.keys(), context, hoodieTable, + config.getSimpleIndexParallelism()); + + HoodieData> taggedRecords = + keyedInputRecords.leftOuterJoin(existingLocationsOnTable).map(entry -> { + final HoodieRecord untaggedRecord = entry.getRight().getLeft(); + final Option location = Option.ofNullable(entry.getRight().getRight().orElse(null)); + return HoodieIndexUtils.getTaggedRecord(untaggedRecord, location); + }); + + if (config.getSimpleIndexUseCaching()) { + inputRecords.unpersist(); + } + return taggedRecords; + } + + /** + * Fetch record locations for passed in {@link HoodieKey}s. + * + * @param hoodieKeys {@link HoodieData} of {@link HoodieKey}s for which locations are fetched + * @param context instance of {@link HoodieEngineContext} to use + * @param hoodieTable instance of {@link HoodieTable} of interest + * @param parallelism parallelism to use + * @return {@link HoodiePairData} of {@link HoodieKey} and {@link HoodieRecordLocation} + */ + protected HoodiePairData fetchRecordLocationsForAffectedPartitions( + HoodieData hoodieKeys, HoodieEngineContext context, HoodieTable hoodieTable, + int parallelism) { + List affectedPartitionPathList = + hoodieKeys.map(HoodieKey::getPartitionPath).distinct().collectAsList(); + List> latestBaseFiles = + getLatestBaseFilesForAllPartitions(affectedPartitionPathList, context, hoodieTable); + return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles); + } + + protected HoodiePairData fetchRecordLocations( + HoodieEngineContext context, HoodieTable hoodieTable, int parallelism, + List> baseFiles) { + int fetchParallelism = Math.max(1, Math.max(baseFiles.size(), parallelism)); + + return context.parallelize(baseFiles, fetchParallelism) + .flatMap(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile, keyGeneratorOpt) + .locations().iterator()) + .mapToPair(e -> (Pair) e); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java index 1315c99406ab6..a33383a05c02d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java @@ -196,7 +196,7 @@ private Option getIndexedRecord(HoodieRecord hoodieRecord) { Option> recordMetadata = hoodieRecord.getData().getMetadata(); try { // Pass the isUpdateRecord to the props for HoodieRecordPayload to judge - // Whether it is a update or insert record. + // Whether it is an update or insert record. 
boolean isUpdateRecord = isUpdateRecord(hoodieRecord); // If the format can not record the operation field, nullify the DELETE payload manually. boolean nullifyPayload = HoodieOperation.isDelete(hoodieRecord.getOperation()) && !config.allowOperationMetadataField(); @@ -219,7 +219,7 @@ private Option getIndexedRecord(HoodieRecord hoodieRecord) { if (config.allowOperationMetadataField()) { HoodieAvroUtils.addOperationToRecord(rewriteRecord, hoodieRecord.getOperation()); } - if (isUpdateRecord(hoodieRecord)) { + if (isUpdateRecord) { updatedRecordsWritten++; } else { insertRecordsWritten++; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieConcatHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java similarity index 65% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieConcatHandle.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java index 0400608860496..c33c0f08ca830 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieConcatHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java @@ -16,15 +16,16 @@ * limitations under the License. */ -package org.apache.hudi.io.storage; +package org.apache.hudi.io; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieUpsertException; -import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.table.HoodieTable; @@ -34,6 +35,7 @@ import org.apache.log4j.Logger; import java.io.IOException; +import java.util.Collections; import java.util.Iterator; import java.util.Map; @@ -44,21 +46,21 @@ * Simplified Logic: * For every existing record * Write the record as is - * For all incoming records, write to file as is. + * For all incoming records, write to file as is, without de-duplicating based on the record key. * * Illustration with simple data. * Incoming data: - * rec1_2, rec4_2, rec5_1, rec6_1 + * rec1_2, rec1_3, rec4_2, rec5_1, rec6_1 * Existing data: * rec1_1, rec2_1, rec3_1, rec4_1 * * For every existing record, write to storage as is. * => rec1_1, rec2_1, rec3_1 and rec4_1 is written to storage * Write all records from incoming set to storage - * => rec1_2, rec4_2, rec5_1 and rec6_1 + * => rec1_2, rec1_3, rec4_2, rec5_1 and rec6_1 * * Final snapshot in storage - * rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec4_2, rec5_1, rec6_1 + * rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec1_3, rec4_2, rec5_1, rec6_1 * * Users should ensure there are no duplicates when "insert" operation is used and if the respective config is enabled. So, above scenario should not * happen and every batch should have new records to be inserted. Above example is for illustration purposes only. 
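The concatenation behaviour illustrated in the javadoc above reduces to a list append: existing records are rewritten unchanged and incoming records are written as-is, so duplicate keys such as rec1_2 and rec1_3 both survive. A minimal, purely illustrative sketch of that outcome:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

// Illustrative sketch only: concat semantics, with record names taken from the javadoc above.
public class ConcatHandleSketch {
  public static void main(String[] args) {
    List<String> existing = Arrays.asList("rec1_1", "rec2_1", "rec3_1", "rec4_1");
    List<String> incoming = Arrays.asList("rec1_2", "rec1_3", "rec4_2", "rec5_1", "rec6_1");

    // Write every existing record unchanged, then every incoming record unchanged.
    List<String> finalSnapshot = Stream.concat(existing.stream(), incoming.stream())
        .collect(Collectors.toList());

    // rec1 now appears three times (rec1_1, rec1_2, rec1_3): no de-duplication happens.
    System.out.println(finalSnapshot);
  }
}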
@@ -66,16 +68,22 @@ public class HoodieConcatHandle extends HoodieMergeHandle { private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class); + // a representation of incoming records that tolerates duplicate keys + private final Iterator> recordItr; - public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Iterator recordItr, - String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, Option keyGeneratorOpt) { - super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt); + public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier, Option keyGeneratorOpt) { + super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier, keyGeneratorOpt); + this.recordItr = recordItr; } - public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Map keyToNewRecords, String partitionPath, String fileId, - HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) { - super(config, instantTime, hoodieTable, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier, + public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Map> keyToNewRecords, String partitionPath, String fileId, + HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) { + super(config, instantTime, hoodieTable, Collections.emptyMap(), partitionPath, fileId, dataFileToBeMerged, taskContextSupplier, Option.empty()); + this.recordItr = keyToNewRecords.values().iterator(); } /** @@ -94,4 +102,17 @@ public void write(GenericRecord oldRecord) { } recordsWritten++; } + + @Override + protected void writeIncomingRecords() throws IOException { + while (recordItr.hasNext()) { + HoodieRecord record = recordItr.next(); + if (needsUpdateLocation()) { + record.unseal(); + record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); + record.seal(); + } + writeInsertRecord(record); + } + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java index b01d62f1a481d..b13b561f5dff1 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java @@ -177,7 +177,7 @@ private void init(String fileId, String partitionPath, HoodieBaseFile baseFileTo writeStatus.setPartitionPath(partitionPath); writeStatus.getStat().setPartitionPath(partitionPath); writeStatus.getStat().setFileId(fileId); - writeStatus.getStat().setPath(new Path(config.getBasePath()), newFilePath); + setWriteStatusPath(); // Create Marker file createMarkerFile(partitionPath, newFileName); @@ -257,6 +257,18 @@ private boolean writeUpdateRecord(HoodieRecord hoodieRecord, Option hoodieRecord) throws IOException { + Schema schema = useWriterSchema ? 
tableSchemaWithMetaFields : tableSchema; + Option insertRecord = hoodieRecord.getData().getInsertValue(schema, config.getProps()); + // just skip the ignored record + if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) { + return; + } + if (writeRecord(hoodieRecord, insertRecord)) { + insertRecordsWritten++; + } + } + protected boolean writeRecord(HoodieRecord hoodieRecord, Option indexedRecord) { Option recordMetadata = hoodieRecord.getData().getMetadata(); if (!partitionPath.equals(hoodieRecord.getPartitionPath())) { @@ -340,28 +352,28 @@ public void write(GenericRecord oldRecord) { } } + protected void writeIncomingRecords() throws IOException { + // write out any pending records (this can happen when inserts are turned into updates) + Iterator> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap) + ? ((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator(); + while (newRecordsItr.hasNext()) { + HoodieRecord hoodieRecord = newRecordsItr.next(); + if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) { + writeInsertRecord(hoodieRecord); + } + } + } + @Override public List close() { try { - // write out any pending records (this can happen when inserts are turned into updates) - Iterator> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap) - ? ((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator(); - while (newRecordsItr.hasNext()) { - HoodieRecord hoodieRecord = newRecordsItr.next(); - if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) { - Schema schema = useWriterSchema ? tableSchemaWithMetaFields : tableSchema; - Option insertRecord = - hoodieRecord.getData().getInsertValue(schema, config.getProps()); - // just skip the ignore record - if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) { - continue; - } - writeRecord(hoodieRecord, insertRecord); - insertRecordsWritten++; - } - } + writeIncomingRecords(); - ((ExternalSpillableMap) keyToNewRecords).close(); + if (keyToNewRecords instanceof ExternalSpillableMap) { + ((ExternalSpillableMap) keyToNewRecords).close(); + } else { + keyToNewRecords.clear(); + } writtenRecordKeys.clear(); if (fileWriter != null) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java index 763178dbf6054..606e63a340e9d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java @@ -90,9 +90,9 @@ public void write(GenericRecord oldRecord) { } try { if (useWriterSchema) { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchemaWithMetaFields)); + writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchemaWithMetaFields, config.getProps())); } else { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchema)); + writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchema, config.getProps())); } insertRecordsWritten++; writtenRecordKeys.add(keyToPreWrite); @@ -112,9 +112,9 @@ public List close() { HoodieRecord hoodieRecord = keyToNewRecords.get(key); if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) { if (useWriterSchema) { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchemaWithMetaFields)); + 
writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchemaWithMetaFields, config.getProps())); } else { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchema)); + writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchema, config.getProps())); } insertRecordsWritten++; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java index f934a8a83784f..1a10e6a716cd6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java @@ -33,4 +33,10 @@ public HoodieAvroParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionC double compressionRatio) { super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio); } + + public HoodieAvroParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName, + int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, + double compressionRatio, boolean directoryEnabled) { + super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, directoryEnabled); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java index 6e6f66c5eac6d..6db1de012c240 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java @@ -27,15 +27,21 @@ */ public class HoodieBaseParquetConfig { private final T writeSupport; - private CompressionCodecName compressionCodecName; - private int blockSize; - private int pageSize; - private long maxFileSize; - private Configuration hadoopConf; - private double compressionRatio; + private final CompressionCodecName compressionCodecName; + private final int blockSize; + private final int pageSize; + private final long maxFileSize; + private final Configuration hadoopConf; + private final double compressionRatio; + private final boolean dictionaryEnabled; public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize, - int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) { + int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) { + this(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, false); + } + + public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize, + int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio, boolean dictionaryEnabled) { this.writeSupport = writeSupport; this.compressionCodecName = compressionCodecName; this.blockSize = blockSize; @@ -43,6 +49,7 @@ public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionC this.maxFileSize = maxFileSize; this.hadoopConf = hadoopConf; this.compressionRatio = compressionRatio; + this.dictionaryEnabled = dictionaryEnabled; } public CompressionCodecName getCompressionCodecName() { @@ -72,4 +79,8 @@ public double 
getCompressionRatio() { public T getWriteSupport() { return writeSupport; } + + public boolean dictionaryEnabled() { + return dictionaryEnabled; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java index 56022c94edefc..e88c34f608d3e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java @@ -71,7 +71,7 @@ private static HoodieFi HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(), config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(), - hoodieTable.getHadoopConf(), config.getParquetCompressionRatio()); + hoodieTable.getHadoopConf(), config.getParquetCompressionRatio(), config.parquetDictionaryEnabled()); return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier, populateMetaFields); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java index e7328fb50c3bf..4f51de35d24a9 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java @@ -56,7 +56,7 @@ public HoodieParquetWriter(String instantTime, Path file, HoodieAvroParquetConfi super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(), - DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED, + parquetConfig.dictionaryEnabled(), DEFAULT_IS_VALIDATING_ENABLED, DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf())); this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()); this.fs = diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java index 4923d980c9cb3..8038afe4e5687 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java @@ -40,7 +40,7 @@ public class KeyGenUtils { protected static final String NULL_RECORDKEY_PLACEHOLDER = "__null__"; protected static final String EMPTY_RECORDKEY_PLACEHOLDER = "__empty__"; - protected static final String DEFAULT_PARTITION_PATH = "default"; + protected static final String HUDI_DEFAULT_PARTITION_PATH = PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH; public static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/"; /** @@ -121,8 +121,8 @@ public static String getRecordPartitionPath(GenericRecord record, List p for (String partitionPathField : partitionPathFields) { String fieldVal = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true); if (fieldVal == null || fieldVal.isEmpty()) { - partitionPath.append(hiveStylePartitioning ? 
partitionPathField + "=" + DEFAULT_PARTITION_PATH - : DEFAULT_PARTITION_PATH); + partitionPath.append(hiveStylePartitioning ? partitionPathField + "=" + HUDI_DEFAULT_PARTITION_PATH + : HUDI_DEFAULT_PARTITION_PATH); } else { if (encodePartitionPath) { fieldVal = PartitionPathEncodeUtils.escapePathName(fieldVal); @@ -147,7 +147,7 @@ public static String getPartitionPath(GenericRecord record, String partitionPath boolean hiveStylePartitioning, boolean encodePartitionPath) { String partitionPath = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true); if (partitionPath == null || partitionPath.isEmpty()) { - partitionPath = DEFAULT_PARTITION_PATH; + partitionPath = HUDI_DEFAULT_PARTITION_PATH; } if (encodePartitionPath) { partitionPath = PartitionPathEncodeUtils.escapePathName(partitionPath); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index ab5fa994deb76..48d6b948c4133 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -18,11 +18,13 @@ package org.apache.hudi.metadata; +import org.apache.avro.specific.SpecificRecordBase; import org.apache.hudi.avro.model.HoodieCleanMetadata; -import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.avro.model.HoodieInstantInfo; import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.client.AbstractHoodieWriteClient; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -32,12 +34,17 @@ import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.WriteConcurrencyMode; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; @@ -46,10 +53,11 @@ import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieMetricsConfig; +import org.apache.hudi.config.metrics.HoodieMetricsConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; import 
org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hadoop.conf.Configuration; @@ -61,6 +69,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; @@ -81,19 +90,31 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta private static final Logger LOG = LogManager.getLogger(HoodieBackedTableMetadataWriter.class); protected HoodieWriteConfig metadataWriteConfig; - protected HoodieWriteConfig datasetWriteConfig; + protected HoodieWriteConfig dataWriteConfig; protected String tableName; protected HoodieBackedTableMetadata metadata; - protected HoodieTableMetaClient metaClient; + protected HoodieTableMetaClient metadataMetaClient; + protected HoodieTableMetaClient dataMetaClient; protected Option metrics; protected boolean enabled; protected SerializableConfiguration hadoopConf; protected final transient HoodieEngineContext engineContext; - protected HoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteConfig writeConfig, - HoodieEngineContext engineContext) { - this.datasetWriteConfig = writeConfig; + /** + * Hudi backed table metadata writer. + * + * @param hadoopConf - Hadoop configuration to use for the metadata writer + * @param writeConfig - Writer config + * @param engineContext - Engine context + * @param actionMetadata - Optional action metadata to help decide bootstrap operations + * @param - Action metadata types extending Avro generated SpecificRecordBase + */ + protected HoodieBackedTableMetadataWriter(Configuration hadoopConf, + HoodieWriteConfig writeConfig, + HoodieEngineContext engineContext, + Option actionMetadata) { + this.dataWriteConfig = writeConfig; this.engineContext = engineContext; this.hadoopConf = new SerializableConfiguration(hadoopConf); @@ -103,24 +124,21 @@ protected HoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteC enabled = true; // Inline compaction and auto clean is required as we dont expose this table outside - ValidationUtils.checkArgument(!this.metadataWriteConfig.isAutoClean(), "Cleaning is controlled internally for Metadata table."); - ValidationUtils.checkArgument(!this.metadataWriteConfig.inlineCompactionEnabled(), "Compaction is controlled internally for metadata table."); + ValidationUtils.checkArgument(!this.metadataWriteConfig.isAutoClean(), + "Cleaning is controlled internally for Metadata table."); + ValidationUtils.checkArgument(!this.metadataWriteConfig.inlineCompactionEnabled(), + "Compaction is controlled internally for metadata table."); // Metadata Table cannot have metadata listing turned on. (infinite loop, much?) 
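The argument checks above pin down the invariants of the metadata table's write config: cleaning and compaction are driven internally, auto commit must stay on, and the metadata table must not itself use metadata-based file listing. A minimal sketch of those invariants as plain boolean checks over a made-up settings map (not Hudi's config builder API):

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only: the metadata-table write-config invariants as plain checks.
public class MetadataConfigInvariantsSketch {
  public static void main(String[] args) {
    Map<String, Boolean> metadataWriteConfig = new HashMap<>();
    metadataWriteConfig.put("auto.clean", false);          // cleaning is driven internally
    metadataWriteConfig.put("inline.compaction", false);   // compaction is driven internally
    metadataWriteConfig.put("auto.commit", true);          // auto commit is required
    metadataWriteConfig.put("metadata.listing", false);    // would recurse onto itself

    if (metadataWriteConfig.get("auto.clean")
        || metadataWriteConfig.get("inline.compaction")
        || !metadataWriteConfig.get("auto.commit")
        || metadataWriteConfig.get("metadata.listing")) {
      throw new IllegalArgumentException("metadata table write config violates its invariants");
    }
    System.out.println("metadata table write config invariants hold");
  }
}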
- ValidationUtils.checkArgument(this.metadataWriteConfig.shouldAutoCommit(), "Auto commit is required for Metadata Table"); - ValidationUtils.checkArgument(!this.metadataWriteConfig.isMetadataTableEnabled(), "File listing cannot be used for Metadata Table"); + ValidationUtils.checkArgument(this.metadataWriteConfig.shouldAutoCommit(), + "Auto commit is required for Metadata Table"); + ValidationUtils.checkArgument(!this.metadataWriteConfig.isMetadataTableEnabled(), + "File listing cannot be used for Metadata Table"); initRegistry(); - HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(datasetWriteConfig.getBasePath()).build(); - initialize(engineContext, datasetMetaClient); - if (enabled) { - // This is always called even in case the table was created for the first time. This is because - // initFromFilesystem() does file listing and hence may take a long time during which some new updates - // may have occurred on the table. Hence, calling this always ensures that the metadata is brought in sync - // with the active timeline. - HoodieTimer timer = new HoodieTimer().startTimer(); - syncFromInstants(datasetMetaClient); - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.SYNC_STR, timer.endTimer())); - } + this.dataMetaClient = + HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(dataWriteConfig.getBasePath()).build(); + initialize(engineContext, actionMetadata); + initTableMetadata(); } else { enabled = false; this.metrics = Option.empty(); @@ -163,7 +181,7 @@ private HoodieWriteConfig createMetadataWriteConfig(HoodieWriteConfig writeConfi .withAutoClean(false) .withCleanerParallelism(parallelism) .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) - .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) .retainCommits(writeConfig.getMetadataCleanerCommitsRetained()) .archiveCommitsWith(minCommitsToKeep, maxCommitsToKeep) // we will trigger compaction manually, to control the instant times @@ -172,22 +190,26 @@ private HoodieWriteConfig createMetadataWriteConfig(HoodieWriteConfig writeConfi .withParallelism(parallelism, parallelism) .withDeleteParallelism(parallelism) .withRollbackParallelism(parallelism) - .withFinalizeWriteParallelism(parallelism); + .withFinalizeWriteParallelism(parallelism) + .withAllowMultiWriteOnSameInstant(true); if (writeConfig.isMetricsOn()) { - HoodieMetricsConfig.Builder metricsConfig = HoodieMetricsConfig.newBuilder() + builder.withMetricsConfig(HoodieMetricsConfig.newBuilder() .withReporterType(writeConfig.getMetricsReporterType().toString()) .withExecutorMetrics(writeConfig.isExecutorMetricsEnabled()) - .on(true); + .on(true).build()); switch (writeConfig.getMetricsReporterType()) { case GRAPHITE: - metricsConfig.onGraphitePort(writeConfig.getGraphiteServerPort()) + builder.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder() + .onGraphitePort(writeConfig.getGraphiteServerPort()) .toGraphiteHost(writeConfig.getGraphiteServerHost()) - .usePrefix(writeConfig.getGraphiteMetricPrefix()); + .usePrefix(writeConfig.getGraphiteMetricPrefix()).build()); break; case JMX: - metricsConfig.onJmxPort(writeConfig.getJmxPort()) - .toJmxHost(writeConfig.getJmxHost()); + builder.withMetricsJmxConfig(HoodieMetricsJmxConfig.newBuilder() + .onJmxPort(writeConfig.getJmxPort()) + .toJmxHost(writeConfig.getJmxHost()) + .build()); break; case DATADOG: case PROMETHEUS: @@ -198,10 +220,7 @@ private 
HoodieWriteConfig createMetadataWriteConfig(HoodieWriteConfig writeConfi default: throw new HoodieMetadataException("Unsupported Metrics Reporter type " + writeConfig.getMetricsReporterType()); } - - builder.withMetricsConfig(metricsConfig.build()); } - return builder.build(); } @@ -214,87 +233,136 @@ public HoodieBackedTableMetadata metadata() { } /** - * Initialize the metadata table if it does not exist. Update the metadata to bring it in sync with the file system. - * - * This can happen in two ways: - * 1. If the metadata table did not exist, then file and partition listing is used - * 2. If the metadata table exists, the instants from active timeline are read in order and changes applied - * - * The above logic has been chosen because it is faster to perform #1 at scale rather than read all the Instants - * which are large in size (AVRO or JSON encoded and not compressed) and incur considerable IO for de-serialization - * and decoding. + * Initialize the metadata table if it does not exist. + *

+ * If the metadata table does not exist, then file and partition listing is used to bootstrap the table. */ - protected abstract void initialize(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient); + protected abstract void initialize(HoodieEngineContext engineContext, + Option actionMetadata); - protected void initTableMetadata() { + public void initTableMetadata() { try { if (this.metadata != null) { this.metadata.close(); } - this.metadata = new HoodieBackedTableMetadata(engineContext, datasetWriteConfig.getMetadataConfig(), - datasetWriteConfig.getBasePath(), datasetWriteConfig.getSpillableMapBasePath()); - this.metaClient = metadata.getMetaClient(); + this.metadata = new HoodieBackedTableMetadata(engineContext, dataWriteConfig.getMetadataConfig(), + dataWriteConfig.getBasePath(), dataWriteConfig.getSpillableMapBasePath()); + this.metadataMetaClient = metadata.getMetadataMetaClient(); } catch (Exception e) { throw new HoodieException("Error initializing metadata table for reads", e); } } - protected void bootstrapIfNeeded(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient) throws IOException { + /** + * Bootstrap the metadata table if needed. + * + * @param engineContext - Engine context + * @param dataMetaClient - Meta client for the data table + * @param actionMetadata - Optional action metadata + * @param - Action metadata types extending Avro generated SpecificRecordBase + * @throws IOException + */ + protected void bootstrapIfNeeded(HoodieEngineContext engineContext, + HoodieTableMetaClient dataMetaClient, + Option actionMetadata) throws IOException { HoodieTimer timer = new HoodieTimer().startTimer(); - boolean exists = datasetMetaClient.getFs().exists(new Path(metadataWriteConfig.getBasePath(), HoodieTableMetaClient.METAFOLDER_NAME)); + + boolean exists = dataMetaClient.getFs().exists(new Path(metadataWriteConfig.getBasePath(), + HoodieTableMetaClient.METAFOLDER_NAME)); boolean rebootstrap = false; + + // If the un-synced instants have been archived, then + // the metadata table will need to be bootstrapped again. if (exists) { - // If the un-synched instants have been archived then the metadata table will need to be bootstrapped again - HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()) + final HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()) .setBasePath(metadataWriteConfig.getBasePath()).build(); - Option latestMetadataInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant(); - if (!latestMetadataInstant.isPresent()) { - LOG.warn("Metadata Table will need to be re-bootstrapped as no instants were found"); - rebootstrap = true; - } else if (!latestMetadataInstant.get().getTimestamp().equals(SOLO_COMMIT_TIMESTAMP) - && datasetMetaClient.getActiveTimeline().isBeforeTimelineStarts(latestMetadataInstant.get().getTimestamp())) { - LOG.warn("Metadata Table will need to be re-bootstrapped as un-synced instants have been archived." 
- + " latestMetadataInstant=" + latestMetadataInstant.get().getTimestamp() - + ", latestDatasetInstant=" + datasetMetaClient.getActiveTimeline().firstInstant().get().getTimestamp()); - rebootstrap = true; - } + final Option latestMetadataInstant = + metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant(); + + rebootstrap = isBootstrapNeeded(latestMetadataInstant, actionMetadata); } if (rebootstrap) { metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.REBOOTSTRAP_STR, 1)); LOG.info("Deleting Metadata Table directory so that it can be re-bootstrapped"); - datasetMetaClient.getFs().delete(new Path(metadataWriteConfig.getBasePath()), true); + dataMetaClient.getFs().delete(new Path(metadataWriteConfig.getBasePath()), true); exists = false; } if (!exists) { // Initialize for the first time by listing partitions and files directly from the file system - if (bootstrapFromFilesystem(engineContext, datasetMetaClient)) { + if (bootstrapFromFilesystem(engineContext, dataMetaClient)) { metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer())); } } } + /** + * Whether bootstrap operation needed for this metadata table. + *

+ * Rollback of the first commit would look like un-synced instants in the metadata table. + * Action metadata is needed to verify the instant time and avoid erroneous bootstrapping. + *

+ * TODO: Revisit this logic and validate that filtering for all + * commits timeline is the right thing to do + * + * @return True if the bootstrap is not needed, False otherwise + */ + private boolean isBootstrapNeeded(Option latestMetadataInstant, + Option actionMetadata) { + if (!latestMetadataInstant.isPresent()) { + LOG.warn("Metadata Table will need to be re-bootstrapped as no instants were found"); + return true; + } + + final String latestMetadataInstantTimestamp = latestMetadataInstant.get().getTimestamp(); + if (latestMetadataInstantTimestamp.equals(SOLO_COMMIT_TIMESTAMP)) { + return false; + } + + boolean isRollbackAction = false; + List rollbackedTimestamps = Collections.emptyList(); + if (actionMetadata.isPresent() && actionMetadata.get() instanceof HoodieRollbackMetadata) { + isRollbackAction = true; + List rollbackedInstants = + ((HoodieRollbackMetadata) actionMetadata.get()).getInstantsRollback(); + rollbackedTimestamps = rollbackedInstants.stream().map(instant -> { + return instant.getCommitTime().toString(); + }).collect(Collectors.toList()); + } + + if (dataMetaClient.getActiveTimeline().getAllCommitsTimeline().isBeforeTimelineStarts( + latestMetadataInstant.get().getTimestamp()) + && (!isRollbackAction || !rollbackedTimestamps.contains(latestMetadataInstantTimestamp))) { + LOG.warn("Metadata Table will need to be re-bootstrapped as un-synced instants have been archived." + + " latestMetadataInstant=" + latestMetadataInstant.get().getTimestamp() + + ", latestDataInstant=" + dataMetaClient.getActiveTimeline().firstInstant().get().getTimestamp()); + return true; + } + + return false; + } + /** * Initialize the Metadata Table by listing files and partitions from the file system. * - * @param datasetMetaClient {@code HoodieTableMetaClient} for the dataset + * @param dataMetaClient {@code HoodieTableMetaClient} for the dataset. */ - private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient) throws IOException { + private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, HoodieTableMetaClient dataMetaClient) throws IOException { ValidationUtils.checkState(enabled, "Metadata table cannot be initialized as it is not enabled"); // We can only bootstrap if there are no pending operations on the dataset - Option pendingInstantOption = Option.fromJavaOptional(datasetMetaClient.getActiveTimeline() + Option pendingDataInstant = Option.fromJavaOptional(dataMetaClient.getActiveTimeline() .getReverseOrderedInstants().filter(i -> !i.isCompleted()).findFirst()); - if (pendingInstantOption.isPresent()) { + if (pendingDataInstant.isPresent()) { metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1)); - LOG.warn("Cannot bootstrap metadata table as operation is in progress: " + pendingInstantOption.get()); + LOG.warn("Cannot bootstrap metadata table as operation is in progress in dataset: " + pendingDataInstant.get()); return false; } // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit // Otherwise, we use the latest commit timestamp. 
- String createInstantTime = datasetMetaClient.getActiveTimeline().getReverseOrderedInstants().findFirst() + String createInstantTime = dataMetaClient.getActiveTimeline().getReverseOrderedInstants().findFirst() .map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); LOG.info("Creating a new metadata table in " + metadataWriteConfig.getBasePath() + " at instant " + createInstantTime); @@ -307,10 +375,11 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi .initTable(hadoopConf.get(), metadataWriteConfig.getBasePath()); initTableMetadata(); + initializeFileGroups(dataMetaClient, MetadataPartitionType.FILES, createInstantTime, 1); // List all partitions in the basePath of the containing dataset - LOG.info("Initializing metadata table by using file listings in " + datasetWriteConfig.getBasePath()); - Map> partitionToFileStatus = getPartitionsToFilesMapping(datasetMetaClient); + LOG.info("Initializing metadata table by using file listings in " + dataWriteConfig.getBasePath()); + Map> partitionToFileStatus = getPartitionsToFilesMapping(dataMetaClient); // Create a HoodieCommitMetadata with writeStats for all discovered files int[] stats = {0}; @@ -340,24 +409,24 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi }); LOG.info("Committing " + partitionToFileStatus.size() + " partitions and " + stats[0] + " files to metadata"); - update(commitMetadata, createInstantTime); + update(commitMetadata, createInstantTime, false); return true; } /** * Function to find hoodie partitions and list files in them in parallel. * - * @param datasetMetaClient + * @param dataMetaClient * @return Map of partition names to a list of FileStatus for all the files in the partition */ - private Map> getPartitionsToFilesMapping(HoodieTableMetaClient datasetMetaClient) { + private Map> getPartitionsToFilesMapping(HoodieTableMetaClient dataMetaClient) { List pathsToList = new LinkedList<>(); - pathsToList.add(new Path(datasetWriteConfig.getBasePath())); + pathsToList.add(new Path(dataWriteConfig.getBasePath())); Map> partitionToFileStatus = new HashMap<>(); final int fileListingParallelism = metadataWriteConfig.getFileListingParallelism(); - SerializableConfiguration conf = new SerializableConfiguration(datasetMetaClient.getHadoopConf()); - final String dirFilterRegex = datasetWriteConfig.getMetadataConfig().getDirectoryFilterRegex(); + SerializableConfiguration conf = new SerializableConfiguration(dataMetaClient.getHadoopConf()); + final String dirFilterRegex = dataWriteConfig.getMetadataConfig().getDirectoryFilterRegex(); while (!pathsToList.isEmpty()) { int listingParallelism = Math.min(fileListingParallelism, pathsToList.size()); @@ -381,7 +450,7 @@ private Map> getPartitionsToFilesMapping(HoodieTableMet .collect(Collectors.toList()); if (p.getRight().length > filesInDir.size()) { - String partitionName = FSUtils.getRelativePartitionPath(new Path(datasetMetaClient.getBasePath()), p.getLeft()); + String partitionName = FSUtils.getRelativePartitionPath(new Path(dataMetaClient.getBasePath()), p.getLeft()); // deal with Non-partition table, we should exclude .hoodie partitionToFileStatus.put(partitionName, filesInDir.stream() .filter(f -> !f.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)).collect(Collectors.toList())); @@ -399,64 +468,79 @@ private Map> getPartitionsToFilesMapping(HoodieTableMet } /** - * Sync the Metadata Table from the instants created on the dataset. + * Initialize file groups for a partition. 
For file listing, we just have one file group. * - * @param datasetMetaClient {@code HoodieTableMetaClient} for the dataset + * All FileGroups for a given metadata partition has a fixed prefix as per the {@link MetadataPartitionType#getFileIdPrefix()}. + * Each file group is suffixed with 4 digits with increments of 1 starting with 0000. + * + * Lets say we configure 10 file groups for record level index partittion, and prefix as "record-index-bucket-" + * File groups will be named as : + * record-index-bucket-0000, .... -> ..., record-index-bucket-0009 */ - private void syncFromInstants(HoodieTableMetaClient datasetMetaClient) { - ValidationUtils.checkState(enabled, "Metadata table cannot be synced as it is not enabled"); - // (re) init the metadata for reading. - initTableMetadata(); - try { - List instantsToSync = metadata.findInstantsToSyncForWriter(); - if (instantsToSync.isEmpty()) { - return; + private void initializeFileGroups(HoodieTableMetaClient dataMetaClient, MetadataPartitionType metadataPartition, String instantTime, + int fileGroupCount) throws IOException { + + final HashMap blockHeader = new HashMap<>(); + blockHeader.put(HeaderMetadataType.INSTANT_TIME, instantTime); + // Archival of data table has a dependency on compaction(base files) in metadata table. + // It is assumed that as of time Tx of base instant (/compaction time) in metadata table, + // all commits in data table is in sync with metadata table. So, we always start with log file for any fileGroup. + final HoodieDeleteBlock block = new HoodieDeleteBlock(new HoodieKey[0], blockHeader); + + LOG.info(String.format("Creating %d file groups for partition %s with base fileId %s at instant time %s", + fileGroupCount, metadataPartition.partitionPath(), metadataPartition.getFileIdPrefix(), instantTime)); + for (int i = 0; i < fileGroupCount; ++i) { + final String fileGroupFileId = String.format("%s%04d", metadataPartition.getFileIdPrefix(), i); + try { + HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(FSUtils.getPartitionPath(metadataWriteConfig.getBasePath(), metadataPartition.partitionPath())) + .withFileId(fileGroupFileId).overBaseCommit(instantTime) + .withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION) + .withFileSize(0L) + .withSizeThreshold(metadataWriteConfig.getLogFileMaxSize()) + .withFs(dataMetaClient.getFs()) + .withRolloverLogWriteToken(HoodieLogFormat.DEFAULT_WRITE_TOKEN) + .withLogWriteToken(HoodieLogFormat.DEFAULT_WRITE_TOKEN) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + writer.appendBlock(block); + writer.close(); + } catch (InterruptedException e) { + throw new HoodieException("Failed to created fileGroup " + fileGroupFileId + " for partition " + metadataPartition.partitionPath(), e); } - - LOG.info("Syncing " + instantsToSync.size() + " instants to metadata table: " + instantsToSync); - - // Read each instant in order and sync it to metadata table - for (HoodieInstant instant : instantsToSync) { - LOG.info("Syncing instant " + instant + " to metadata table"); - - Option> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, - metaClient.getActiveTimeline(), instant, metadata.getUpdateTime()); - if (records.isPresent()) { - commit(records.get(), MetadataPartitionType.FILES.partitionPath(), instant.getTimestamp()); - } - } - initTableMetadata(); - } catch (IOException ioe) { - throw new HoodieIOException("Unable to sync instants from data to metadata table.", ioe); } } /** - * Update from {@code HoodieCommitMetadata}. 
- * - * @param commitMetadata {@code HoodieCommitMetadata} - * @param instantTime Timestamp at which the commit was performed + * Interface to assist in converting commit metadata to List of HoodieRecords to be written to metadata table. + * Updates of different commit metadata uses the same method to convert to HoodieRecords and hence. */ - @Override - public void update(HoodieCommitMetadata commitMetadata, String instantTime) { - if (enabled) { - List records = HoodieTableMetadataUtil.convertMetadataToRecords(commitMetadata, instantTime); - commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime); + private interface ConvertMetadataFunction { + List convertMetadata(); + } + + /** + * Processes commit metadata from data table and commits to metadata table. + * @param instantTime instant time of interest. + * @param convertMetadataFunction converter function to convert the respective metadata to List of HoodieRecords to be written to metadata table. + * @param type of commit metadata. + * @param canTriggerTableService true if table services can be triggered. false otherwise. + */ + private void processAndCommit(String instantTime, ConvertMetadataFunction convertMetadataFunction, boolean canTriggerTableService) { + if (enabled && metadata != null) { + List records = convertMetadataFunction.convertMetadata(); + commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime, canTriggerTableService); } } /** - * Update from {@code HoodieCleanerPlan}. - * - * @param cleanerPlan {@code HoodieCleanerPlan} - * @param instantTime Timestamp at which the clean plan was generated + * Update from {@code HoodieCommitMetadata}. + * @param commitMetadata {@code HoodieCommitMetadata} + * @param instantTime Timestamp at which the commit was performed + * @param isTableServiceAction {@code true} if commit metadata is pertaining to a table service. {@code false} otherwise. 
*/ @Override - public void update(HoodieCleanerPlan cleanerPlan, String instantTime) { - if (enabled) { - List records = HoodieTableMetadataUtil.convertMetadataToRecords(cleanerPlan, instantTime); - commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime); - } + public void update(HoodieCommitMetadata commitMetadata, String instantTime, boolean isTableServiceAction) { + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(commitMetadata, instantTime), !isTableServiceAction); } /** @@ -467,10 +551,8 @@ public void update(HoodieCleanerPlan cleanerPlan, String instantTime) { */ @Override public void update(HoodieCleanMetadata cleanMetadata, String instantTime) { - if (enabled) { - List records = HoodieTableMetadataUtil.convertMetadataToRecords(cleanMetadata, instantTime); - commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime); - } + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(cleanMetadata, instantTime), + false); } /** @@ -481,11 +563,8 @@ public void update(HoodieCleanMetadata cleanMetadata, String instantTime) { */ @Override public void update(HoodieRestoreMetadata restoreMetadata, String instantTime) { - if (enabled) { - List records = HoodieTableMetadataUtil.convertMetadataToRecords(metaClient.getActiveTimeline(), - restoreMetadata, instantTime, metadata.getUpdateTime()); - commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime); - } + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(metadataMetaClient.getActiveTimeline(), + restoreMetadata, instantTime, metadata.getSyncedInstantTime()), false); } /** @@ -496,10 +575,22 @@ public void update(HoodieRestoreMetadata restoreMetadata, String instantTime) { */ @Override public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) { - if (enabled) { - List records = HoodieTableMetadataUtil.convertMetadataToRecords(metaClient.getActiveTimeline(), - rollbackMetadata, instantTime, metadata.getUpdateTime()); - commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime); + if (enabled && metadata != null) { + // Is this rollback of an instant that has been synced to the metadata table? + String rollbackInstant = rollbackMetadata.getCommitsRollback().get(0); + boolean wasSynced = metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, rollbackInstant)); + if (!wasSynced) { + // A compaction may have taken place on metadata table which would have included this instant being rolled back. + // Revisit this logic to relax the compaction fencing : https://issues.apache.org/jira/browse/HUDI-2458 + Option latestCompaction = metadata.getLatestCompactionTime(); + if (latestCompaction.isPresent()) { + wasSynced = HoodieTimeline.compareTimestamps(rollbackInstant, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCompaction.get()); + } + } + + List records = HoodieTableMetadataUtil.convertMetadataToRecords(metadataMetaClient.getActiveTimeline(), rollbackMetadata, instantTime, + metadata.getSyncedInstantTime(), wasSynced); + commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime, false); } } @@ -510,13 +601,50 @@ public void close() throws Exception { } } - public HoodieBackedTableMetadata getMetadataReader() { - return metadata; - } - /** * Commit the {@code HoodieRecord}s to Metadata Table as a new delta-commit. + * @param records The list of records to be written. 
+ * @param partitionName The partition to which the records are to be written. + * @param instantTime The timestamp to use for the deltacommit. + * @param canTriggerTableService true if table services can be scheduled and executed. false otherwise. + */ + protected abstract void commit(List records, String partitionName, String instantTime, boolean canTriggerTableService); + + /** + * Perform a compaction on the Metadata Table. + * + * Cases to be handled: + * 1. We cannot perform compaction if there are previous inflight operations on the dataset. This is because + * a compacted metadata base file at time Tx should represent all the actions on the dataset till time Tx. * + * 2. In multi-writer scenario, a parallel operation with a greater instantTime may have completed creating a + * deltacommit. */ - protected abstract void commit(List records, String partitionName, String instantTime); + protected void compactIfNecessary(AbstractHoodieWriteClient writeClient, String instantTime) { + String latestDeltacommitTime = metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant() + .get().getTimestamp(); + List pendingInstants = dataMetaClient.reloadActiveTimeline().filterInflightsAndRequested() + .findInstantsBefore(latestDeltacommitTime).getInstants().collect(Collectors.toList()); + + if (!pendingInstants.isEmpty()) { + LOG.info(String.format("Cannot compact metadata table as there are %d inflight instants before latest deltacommit %s: %s", + pendingInstants.size(), latestDeltacommitTime, Arrays.toString(pendingInstants.toArray()))); + return; + } + + // Trigger compaction with suffixes based on the same instant time. This ensures that any future + // delta commits synced over will not have an instant time lesser than the last completed instant on the + // metadata table. + final String compactionInstantTime = latestDeltacommitTime + "001"; + if (writeClient.scheduleCompactionAtInstant(compactionInstantTime, Option.empty())) { + writeClient.compact(compactionInstantTime); + } + } + + protected void doClean(AbstractHoodieWriteClient writeClient, String instantTime) { + // Trigger cleaning with suffixes based on the same instant time. This ensures that any future + // delta commits synced over will not have an instant time lesser than the last completed instant on the + // metadata table. + writeClient.clean(instantTime + "002"); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java index 02c5b9e646ad0..4f5ac027c91eb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java @@ -19,7 +19,6 @@ package org.apache.hudi.metadata; import org.apache.hudi.avro.model.HoodieCleanMetadata; -import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -31,13 +30,34 @@ */ public interface HoodieTableMetadataWriter extends Serializable, AutoCloseable { - void update(HoodieCommitMetadata commitMetadata, String instantTime); - - void update(HoodieCleanerPlan cleanerPlan, String instantTime); - + /** + * Update the metadata table due to a COMMIT operation. 
+ * @param commitMetadata commit metadata of the operation of interest. + * @param instantTime instant time of the commit. + * @param isTableServiceAction true if caller is a table service. false otherwise. Only regular write operations can trigger metadata table services and this argument + * will assist in this. + */ + void update(HoodieCommitMetadata commitMetadata, String instantTime, boolean isTableServiceAction); + + /** + * Update the metadata table due to a CLEAN operation. + * @param cleanMetadata clean metadata of the operation of interest. + * @param instantTime instant time of the commit. + */ void update(HoodieCleanMetadata cleanMetadata, String instantTime); + /** + * Update the metadata table due to a RESTORE operation. + * @param restoreMetadata restore metadata of the operation of interest. + * @param instantTime instant time of the commit. + */ void update(HoodieRestoreMetadata restoreMetadata, String instantTime); + /** + * Update the metadata table due to a ROLLBACK operation. + * @param rollbackMetadata rollback metadata of the operation of interest. + * @param instantTime instant time of the commit. + */ void update(HoodieRollbackMetadata rollbackMetadata, String instantTime); + } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java index 9855ac0b0272d..c6dff8fd869ec 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java @@ -42,6 +42,7 @@ public class MetricsGraphiteReporter extends MetricsReporter { private final HoodieWriteConfig config; private String serverHost; private int serverPort; + private final int periodSeconds; public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) { this.registry = registry; @@ -56,12 +57,13 @@ public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry } this.graphiteReporter = createGraphiteReport(); + this.periodSeconds = config.getGraphiteReportPeriodSeconds(); } @Override public void start() { if (graphiteReporter != null) { - graphiteReporter.start(30, TimeUnit.SECONDS); + graphiteReporter.start(periodSeconds, TimeUnit.SECONDS); } else { LOG.error("Cannot start as the graphiteReporter is null."); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java new file mode 100644 index 0000000000000..3aa808075d330 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/ZOrderingUtil.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.optimize; + +import java.nio.charset.Charset; + +public class ZOrderingUtil { + + /** + * Lexicographically compare two arrays. + * copy from hbase + * @param buffer1 left operand + * @param buffer2 right operand + * @param offset1 Where to start comparing in the left buffer + * @param offset2 Where to start comparing in the right buffer + * @param length1 How much to compare from the left buffer + * @param length2 How much to compare from the right buffer + * @return 0 if equal, < 0 if left is less than right, etc. + */ + public static int compareTo(byte[] buffer1, int offset1, int length1, + byte[] buffer2, int offset2, int length2) { + // Short circuit equal case + if (buffer1 == buffer2 + && offset1 == offset2 + && length1 == length2) { + return 0; + } + // Bring WritableComparator code local + int end1 = offset1 + length1; + int end2 = offset2 + length2; + for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) { + int a = (buffer1[i] & 0xff); + int b = (buffer2[j] & 0xff); + if (a != b) { + return a - b; + } + } + return length1 - length2; + } + + public static byte[] paddingTo8Byte(byte[] a) { + if (a.length == 8) { + return a; + } + if (a.length > 8) { + byte[] result = new byte[8]; + System.arraycopy(a, 0, result, 0, 8); + return result; + } + int paddingSize = 8 - a.length; + byte[] result = new byte[8]; + for (int i = 0; i < paddingSize; i++) { + result[i] = 0; + } + System.arraycopy(a, 0, result, paddingSize, a.length); + + return result; + } + + /** + * Interleaving array bytes. + * Interleaving means take one bit from the first matrix element, one bit + * from the next, etc, then take the second bit from the first matrix + * element, second bit from the second, all the way to the last bit of the + * last element. 
Combine those bits in that order into a single BigInteger, + * @param buffer candidate element to do interleaving + * @return byte size of candidate element + */ + public static byte[] interleaving(byte[][] buffer, int size) { + int candidateSize = buffer.length; + byte[] result = new byte[size * candidateSize]; + int resBitPos = 0; + int totalBits = size * 8; + for (int bitStep = 0; bitStep < totalBits; bitStep++) { + int currentBytePos = (int) Math.floor(bitStep / 8); + int currentBitPos = bitStep % 8; + + for (int i = 0; i < candidateSize; i++) { + int tempResBytePos = (int) Math.floor(resBitPos / 8); + int tempResBitPos = resBitPos % 8; + result[tempResBytePos] = updatePos(result[tempResBytePos], tempResBitPos, buffer[i][currentBytePos], currentBitPos); + resBitPos++; + } + } + return result; + } + + public static byte updatePos(byte a, int apos, byte b, int bpos) { + byte temp = (byte) (b & (1 << (7 - bpos))); + if (apos < bpos) { + temp = (byte) (temp << (bpos - apos)); + } + if (apos > bpos) { + temp = (byte) (temp >> (apos - bpos)); + } + byte atemp = (byte) (a & (1 << (7 - apos))); + if ((byte) (atemp ^ temp) == 0) { + return a; + } + return (byte) (a ^ (1 << (7 - apos))); + } + + public static byte[] toBytes(int val) { + byte[] b = new byte[4]; + for (int i = 3; i > 0; i--) { + b[i] = (byte) val; + val >>>= 8; + } + b[0] = (byte) val; + return b; + } + + public static byte[] toBytes(long val) { + long temp = val; + byte[] b = new byte[8]; + for (int i = 7; i > 0; i--) { + b[i] = (byte) temp; + temp >>>= 8; + } + b[0] = (byte) temp; + return b; + } + + public static byte[] toBytes(final double d) { + return toBytes(Double.doubleToRawLongBits(d)); + } + + public static byte[] intTo8Byte(int a) { + int temp = a; + temp = temp ^ (1 << 31); + return paddingTo8Byte(toBytes(temp)); + } + + public static byte[] byteTo8Byte(byte a) { + return paddingTo8Byte(new byte[] { a }); + } + + public static byte[] longTo8Byte(long a) { + long temp = a; + temp = temp ^ (1L << 63); + return toBytes(temp); + } + + public static byte[] doubleTo8Byte(double a) { + byte[] temp = toBytes(a); + if (a > 0) { + temp[0] = (byte) (temp[0] ^ (1 << 7)); + } + if (a < 0) { + for (int i = 0; i < temp.length; i++) { + temp[i] = (byte) ~temp[i]; + } + } + return temp; + } + + public static byte[] utf8To8Byte(String a) { + return paddingTo8Byte(a.getBytes(Charset.forName("utf-8"))); + } + + public static Long convertStringToLong(String a) { + byte[] bytes = utf8To8Byte(a); + long temp = 0L; + for (int i = 7; i >= 0; i--) { + temp = temp | (((long)bytes[i] & 0xff) << (7 - i) * 8); + } + return temp; + } +} + diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieCompactionHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieCompactionHandler.java new file mode 100644 index 0000000000000..eeb287abd543c --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieCompactionHandler.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * Interface for insert and update operations in compaction. + * + * @param HoodieRecordPayload type. + */ +public interface HoodieCompactionHandler { + Iterator> handleUpdate(String instantTime, String partitionPath, String fileId, + Map> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException; + + Iterator> handleInsert(String instantTime, String partitionPath, String fileId, + Map> recordMap); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java index ad40c8ec73b85..6046374ba107d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -18,6 +18,7 @@ package org.apache.hudi.table; +import org.apache.avro.specific.SpecificRecordBase; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; @@ -25,6 +26,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; @@ -62,6 +64,7 @@ import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; import org.apache.hudi.table.marker.WriteMarkers; @@ -98,7 +101,7 @@ public abstract class HoodieTable implem protected final HoodieWriteConfig config; protected final HoodieTableMetaClient metaClient; - protected final HoodieIndex index; + protected final HoodieIndex index; private SerializableConfiguration hadoopConfiguration; protected final TaskContextSupplier taskContextSupplier; private final HoodieTableMetadata metadata; @@ -122,7 +125,7 @@ protected HoodieTable(HoodieWriteConfig config, HoodieEngineContext context, Hoo this.taskContextSupplier = context.getTaskContextSupplier(); } - protected abstract HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context); + protected abstract HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context); private synchronized FileSystemViewManager getViewManager() { if (null == viewManager) { @@ -241,6 +244,16 @@ public 
abstract HoodieWriteMetadata bulkInsertPrepped(HoodieEngineContext con */ public abstract HoodieWriteMetadata insertOverwriteTable(HoodieEngineContext context, String instantTime, I records); + /** + * update statistics info for current table. + * to do adaptation, once RFC-27 is finished. + * + * @param context HoodieEngineContext + * @param instantTime Instant time for the replace action + * @param isOptimizeOperation whether current operation is OPTIMIZE type + */ + public abstract void updateStatistics(HoodieEngineContext context, List stats, String instantTime, Boolean isOptimizeOperation); + public HoodieWriteConfig getConfig() { return config; } @@ -316,6 +329,13 @@ public HoodieTimeline getCleanTimeline() { return getActiveTimeline().getCleanerTimeline(); } + /** + * Get rollback timeline. + */ + public HoodieTimeline getRollbackTimeline() { + return getActiveTimeline().getRollbackTimeline(); + } + /** * Get only the completed (no-inflights) savepoint timeline. */ @@ -337,7 +357,7 @@ public HoodieActiveTimeline getActiveTimeline() { /** * Return the index. */ - public HoodieIndex getIndex() { + public HoodieIndex getIndex() { return index; } @@ -356,12 +376,11 @@ public abstract Option scheduleCompaction(HoodieEngineCont /** * Run Compaction on the table. Compaction arranges the data so that it is optimized for data access. * - * @param context HoodieEngineContext + * @param context HoodieEngineContext * @param compactionInstantTime Instant Time */ public abstract HoodieWriteMetadata compact(HoodieEngineContext context, - String compactionInstantTime); - + String compactionInstantTime); /** * Schedule clustering for the instant time. @@ -415,8 +434,21 @@ public abstract Option scheduleCleaning(HoodieEngineContext c * * @return information on cleaned file slices */ - public abstract HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime); + public abstract HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime, boolean skipLocking); + /** + * Schedule rollback for the instant time. + * + * @param context HoodieEngineContext + * @param instantTime Instant Time for scheduling rollback + * @param instantToRollback instant to be rolled back + * @return HoodieRollbackPlan containing info on rollback. + */ + public abstract Option scheduleRollback(HoodieEngineContext context, + String instantTime, + HoodieInstant instantToRollback, + boolean skipTimelinePublish); + /** * Rollback the (inflight/committed) record changes with the given commit time. *

@@ -430,7 +462,8 @@ public abstract Option scheduleCleaning(HoodieEngineContext c
   public abstract HoodieRollbackMetadata rollback(HoodieEngineContext context,
                                                   String rollbackInstantTime,
                                                   HoodieInstant commitInstant,
-                                                  boolean deleteInstants);
+                                                  boolean deleteInstants,
+                                                  boolean skipLocking);
 
   /**
    * Create a savepoint at the specified instant, so that the table can be restored
@@ -449,11 +482,24 @@ public abstract HoodieRestoreMetadata restore(HoodieEngineContext context,
                                                 String restoreInstantTime,
                                                 String instantToRestore);
 
+  /**
+   * Rollback failed compactions. Inflight rollbacks for compactions revert the .inflight file
+   * to the .requested file.
+   *
+   * @param inflightInstant Inflight Compaction Instant
+   */
+  public void rollbackInflightCompaction(HoodieInstant inflightInstant) {
+    String commitTime = HoodieActiveTimeline.createNewInstantTime();
+    scheduleRollback(context, commitTime, inflightInstant, false);
+    rollback(context, commitTime, inflightInstant, false, false);
+    getActiveTimeline().revertCompactionInflightToRequested(inflightInstant);
+  }
+
   /**
    * Finalize the written data onto storage. Perform any final cleanups.
    *
    * @param context HoodieEngineContext
-   * @param stats List of HoodieWriteStats
+   * @param stats   List of HoodieWriteStats
    * @throws HoodieIOException if some paths can't be finalized on storage
    */
   public void finalizeWrite(HoodieEngineContext context, String instantTs, List stats) throws HoodieIOException {
@@ -682,4 +728,32 @@ public HoodieEngineContext getContext() {
     // to engine context, and it ends up being null (as its not serializable and marked transient here).
     return context == null ? new HoodieLocalEngineContext(hadoopConfiguration.get()) : context;
   }
+
+  /**
+   * Get Table metadata writer.
+   *
+   * @return instance of {@link HoodieTableMetadataWriter}
+   */
+  public final Option<HoodieTableMetadataWriter> getMetadataWriter() {
+    return getMetadataWriter(Option.empty());
+  }
+
+  /**
+   * Check if action type is a table service.
+   * @param actionType action type of interest.
+   * @return true if action represents a table service. false otherwise.
+   */
+  public abstract boolean isTableServiceAction(String actionType);
+
+  /**
+   * Get Table metadata writer.
+   *
+   * @return instance of {@link HoodieTableMetadataWriter}
+   */
+  public <T extends SpecificRecordBase> Option<HoodieTableMetadataWriter> getMetadataWriter(Option<T> actionMetadata) {
+    // Each engine is expected to override this and
+    // provide the actual metadata writer, if enabled.
+    return Option.empty();
+  }
+
 }
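Note (not part of the patch): the new HoodieTable hooks above expose two things to engines: an isTableServiceAction(actionType) check, so that only regular writes can trigger metadata-table services, and an overridable getMetadataWriter(...) that defaults to Option.empty(). As a rough, self-contained illustration of the first hook, the sketch below keys the decision on plain action-type strings; the string values and class name are illustrative assumptions, not the actual Hudi constants or implementation.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Illustrative sketch only: decide whether an action type is a table service.
// Real engines would consult HoodieTimeline action constants and engine-specific rules.
public class TableServiceActionSketch {
  private static final Set<String> TABLE_SERVICE_ACTIONS =
      new HashSet<>(Arrays.asList("compaction", "clean"));

  public static boolean isTableServiceAction(String actionType) {
    return TABLE_SERVICE_ACTIONS.contains(actionType);
  }

  public static void main(String[] args) {
    System.out.println(isTableServiceAction("clean"));  // true  -> must not re-trigger table services
    System.out.println(isTableServiceAction("commit")); // false -> regular write, services may run
  }
}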
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java
index 317512f766c99..d492fb6577a93 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java
@@ -200,20 +200,19 @@ private Stream getInstantsToArchive() {
         .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(),
             HoodieInstant.getComparableAction(i.getAction()))));
 
-    // If metadata table is enabled, do not archive instants which are more recent that the latest synced
-    // instant on the metadata table. This is required for metadata table sync.
+    // If metadata table is enabled, do not archive instants which are more recent than the last compaction on the
+    // metadata table.
     if (config.isMetadataTableEnabled()) {
       try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(),
           config.getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) {
-        Option lastSyncedInstantTime = tableMetadata.getUpdateTime();
-
-        if (lastSyncedInstantTime.isPresent()) {
-          LOG.info("Limiting archiving of instants to last synced instant on metadata table at " + lastSyncedInstantTime.get());
-          instants = instants.filter(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), HoodieTimeline.LESSER_THAN,
-              lastSyncedInstantTime.get()));
-        } else {
-          LOG.info("Not archiving as there is no instants yet on the metadata table");
+        Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();
+        if (!latestCompactionTime.isPresent()) {
+          LOG.info("Not archiving as there is no compaction yet on the metadata table");
           instants = Stream.empty();
+        } else {
+          LOG.info("Limiting archiving of instants to latest compaction on metadata table at " + latestCompactionTime.get());
+          instants = instants.filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN,
+              latestCompactionTime.get()));
         }
       } catch (Exception e) {
         throw new HoodieException("Error limiting instant archival based on metadata table", e);
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java
index 6c776cfb077ae..a22479b6bf341 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java
@@ -21,8 +21,13 @@
 import java.io.Serializable;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.avro.model.HoodieCleanMetadata;
+import org.apache.hudi.avro.model.HoodieRestoreMetadata;
+import org.apache.hudi.avro.model.HoodieRollbackMetadata;
 import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.table.HoodieTable;
 
@@ -46,4 +51,36 @@ public BaseActionExecutor(HoodieEngineContext context, HoodieWriteConfig config,
   }
 
   public abstract R execute();
+
+  /**
+   * Writes commit metadata to the metadata table.
+   * @param metadata commit metadata of interest.
+   * @param actionType action type of the commit.
+   */
+  protected final void writeTableMetadata(HoodieCommitMetadata metadata, String actionType) {
+    table.getMetadataWriter().ifPresent(w -> w.update(metadata, instantTime, table.isTableServiceAction(actionType)));
+  }
+
+  /**
+   * Writes clean metadata to table metadata.
+   * @param metadata clean metadata of interest.
+   */
+  protected final void writeTableMetadata(HoodieCleanMetadata metadata) {
+    table.getMetadataWriter().ifPresent(w -> w.update(metadata, instantTime));
+  }
+
+  /**
+   * Writes rollback metadata to table metadata.
+   * @param metadata rollback metadata of interest.
+   */
+  protected final void writeTableMetadata(HoodieRollbackMetadata metadata) {
+    table.getMetadataWriter(Option.of(metadata)).ifPresent(w -> w.update(metadata, instantTime));
+  }
+
+  /**
+   * Writes restore metadata to table metadata.
+   * @param metadata restore metadata of interest.
+   */
+  protected final void writeTableMetadata(HoodieRestoreMetadata metadata) {
+    table.getMetadataWriter().ifPresent(w -> w.update(metadata, instantTime));
+  }
 }
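Note (not part of the patch): the writeTableMetadata helpers above funnel every action's metadata through an optional metadata writer, so the call is a no-op when the metadata table is disabled. A minimal stand-alone sketch of that "optional writer" pattern is shown below; the MetadataWriter interface and class names are hypothetical stand-ins, not Hudi types.

import java.util.Optional;

// Sketch: an executor holds an Optional writer and only forwards metadata
// when one is present (i.e., when the metadata table is enabled).
public class OptionalWriterSketch {
  interface MetadataWriter {
    void update(String commitMetadata, String instantTime);
  }

  static class ActionExecutor {
    private final Optional<MetadataWriter> metadataWriter;
    private final String instantTime;

    ActionExecutor(Optional<MetadataWriter> metadataWriter, String instantTime) {
      this.metadataWriter = metadataWriter;
      this.instantTime = instantTime;
    }

    void writeTableMetadata(String commitMetadata) {
      // No-op when the metadata table is disabled.
      metadataWriter.ifPresent(w -> w.update(commitMetadata, instantTime));
    }
  }

  public static void main(String[] args) {
    MetadataWriter writer = (meta, instant) -> System.out.println("synced " + meta + " at " + instant);
    new ActionExecutor(Optional.of(writer), "20210903010101").writeTableMetadata("commit-meta");
    new ActionExecutor(Optional.empty(), "20210903010101").writeTableMetadata("commit-meta"); // silently skipped
  }
}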
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/HoodieWriteMetadata.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/HoodieWriteMetadata.java
index 5ef204f9706db..d771a574e37e5 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/HoodieWriteMetadata.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/HoodieWriteMetadata.java
@@ -46,6 +46,36 @@ public class HoodieWriteMetadata {
   public HoodieWriteMetadata() {
   }
 
+  /**
+   * Clones the write metadata with transformed write statuses.
+   *
+   * @param transformedWriteStatuses transformed write statuses
+   * @param <T> type of transformed write statuses
+   * @return Cloned {@link HoodieWriteMetadata} instance
+   */
+  public <T> HoodieWriteMetadata<T> clone(T transformedWriteStatuses) {
+    HoodieWriteMetadata<T> newMetadataInstance = new HoodieWriteMetadata<>();
+    newMetadataInstance.setWriteStatuses(transformedWriteStatuses);
+    if (indexLookupDuration.isPresent()) {
+      newMetadataInstance.setIndexLookupDuration(indexLookupDuration.get());
+    }
+    newMetadataInstance.setCommitted(isCommitted);
+    newMetadataInstance.setCommitMetadata(commitMetadata);
+    if (writeStats.isPresent()) {
+      newMetadataInstance.setWriteStats(writeStats.get());
+    }
+    if (indexUpdateDuration.isPresent()) {
+      newMetadataInstance.setIndexUpdateDuration(indexUpdateDuration.get());
+    }
+    if (finalizeDuration.isPresent()) {
+      newMetadataInstance.setFinalizeDuration(finalizeDuration.get());
+    }
+    if (partitionToReplaceFileIds.isPresent()) {
+      newMetadataInstance.setPartitionToReplaceFileIds(partitionToReplaceFileIds.get());
+    }
+    return newMetadataInstance;
+  }
+
   public O getWriteStatuses() {
     return writeStatuses;
   }
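Note (not part of the patch): HoodieWriteMetadata.clone(...) above copies the bookkeeping fields while swapping in write statuses of a different container type (for example, after transforming an engine-specific collection into a List). A simplified, self-contained sketch of that generic "retype the payload, keep the metadata" idea follows; the field set is deliberately reduced and the class name is made up.

// Sketch: clone a result wrapper onto a new payload type while keeping the
// rest of the metadata. Only a couple of representative fields are modeled.
public class WriteMetadataSketch<O> {
  private O writeStatuses;
  private boolean committed;
  private String commitMetadata;

  public <T> WriteMetadataSketch<T> cloneWith(T transformedWriteStatuses) {
    WriteMetadataSketch<T> cloned = new WriteMetadataSketch<>();
    cloned.writeStatuses = transformedWriteStatuses;
    cloned.committed = this.committed;
    cloned.commitMetadata = this.commitMetadata;
    return cloned;
  }

  public static void main(String[] args) {
    WriteMetadataSketch<int[]> original = new WriteMetadataSketch<>();
    original.writeStatuses = new int[] {1, 2, 3};
    original.committed = true;
    original.commitMetadata = "stats";

    // Same bookkeeping, different write-status container type.
    WriteMetadataSketch<java.util.List<Integer>> retyped =
        original.cloneWith(java.util.Arrays.asList(1, 2, 3));
    System.out.println(retyped.committed + " / " + retyped.commitMetadata);
  }
}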
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanActionExecutor.java
deleted file mode 100644
index acc3cdc6793ab..0000000000000
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanActionExecutor.java
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.table.action.clean;
-
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hudi.avro.model.HoodieCleanMetadata;
-import org.apache.hudi.avro.model.HoodieCleanerPlan;
-import org.apache.hudi.common.HoodieCleanStat;
-import org.apache.hudi.common.engine.HoodieEngineContext;
-import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.table.timeline.HoodieInstant;
-import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
-import org.apache.hudi.common.util.CleanerUtils;
-import org.apache.hudi.common.util.HoodieTimer;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.ValidationUtils;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.table.HoodieTable;
-import org.apache.hudi.table.action.BaseActionExecutor;
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.stream.Collectors;
-
-public abstract class BaseCleanActionExecutor extends BaseActionExecutor {
-
-  private static final long serialVersionUID = 1L;
-  private static final Logger LOG = LogManager.getLogger(BaseCleanActionExecutor.class);
-
-  public BaseCleanActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime) {
-    super(context, config, table, instantTime);
-  }
-
-  protected static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) throws IOException {
-    Path deletePath = new Path(deletePathStr);
-    LOG.debug("Working on delete path :" + deletePath);
-    try {
-      boolean deleteResult = fs.delete(deletePath, false);
-      if (deleteResult) {
-        LOG.debug("Cleaned file at path :" + deletePath);
-      }
-      return deleteResult;
-    } catch (FileNotFoundException fio) {
-      // With cleanPlan being used for retried cleaning operations, its possible to clean a file twice
-      return false;
-    }
-  }
-
-  /**
-   * Performs cleaning of partition paths according to cleaning policy and returns the number of files cleaned. Handles
-   * skews in partitions to clean by making files to clean as the unit of task distribution.
-   *
-   * @throws IllegalArgumentException if unknown cleaning policy is provided
-   */
-  abstract List clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan);
-
-  /**
-   * Executes the Cleaner plan stored in the instant metadata.
-   */
-  HoodieCleanMetadata runPendingClean(HoodieTable table, HoodieInstant cleanInstant) {
-    try {
-      HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(table.getMetaClient(), cleanInstant);
-      return runClean(table, cleanInstant, cleanerPlan);
-    } catch (IOException e) {
-      throw new HoodieIOException(e.getMessage(), e);
-    }
-  }
-
-  private HoodieCleanMetadata runClean(HoodieTable table, HoodieInstant cleanInstant, HoodieCleanerPlan cleanerPlan) {
-    ValidationUtils.checkArgument(cleanInstant.getState().equals(HoodieInstant.State.REQUESTED)
-        || cleanInstant.getState().equals(HoodieInstant.State.INFLIGHT));
-
-    try {
-      final HoodieInstant inflightInstant;
-      final HoodieTimer timer = new HoodieTimer();
-      timer.startTimer();
-      if (cleanInstant.isRequested()) {
-        inflightInstant = table.getActiveTimeline().transitionCleanRequestedToInflight(cleanInstant,
-            TimelineMetadataUtils.serializeCleanerPlan(cleanerPlan));
-      } else {
-        inflightInstant = cleanInstant;
-      }
-
-      List<HoodieCleanStat> cleanStats = clean(context, cleanerPlan);
-      if (cleanStats.isEmpty()) {
-        return HoodieCleanMetadata.newBuilder().build();
-      }
-
-      table.getMetaClient().reloadActiveTimeline();
-      HoodieCleanMetadata metadata = CleanerUtils.convertCleanMetadata(
-          inflightInstant.getTimestamp(),
-          Option.of(timer.endTimer()),
-          cleanStats
-      );
-
-      table.getActiveTimeline().transitionCleanInflightToComplete(inflightInstant,
-          TimelineMetadataUtils.serializeCleanMetadata(metadata));
-      LOG.info("Marked clean started on " + inflightInstant.getTimestamp() + " as complete");
-      return metadata;
-    } catch (IOException e) {
-      throw new HoodieIOException("Failed to clean up after commit", e);
-    }
-  }
-
-  @Override
-  public HoodieCleanMetadata execute() {
-    List<HoodieCleanMetadata> cleanMetadataList = new ArrayList<>();
-    // If there are inflight(failed) or previously requested clean operation, first perform them
-    List<HoodieInstant> pendingCleanInstants = table.getCleanTimeline()
-        .filterInflightsAndRequested().getInstants().collect(Collectors.toList());
-    if (pendingCleanInstants.size() > 0) {
-      pendingCleanInstants.forEach(hoodieInstant -> {
-        LOG.info("Finishing previously unfinished cleaner instant=" + hoodieInstant);
-        try {
-          cleanMetadataList.add(runPendingClean(table, hoodieInstant));
-        } catch (Exception e) {
-          LOG.warn("Failed to perform previous clean operation, instant: " + hoodieInstant, e);
-        }
-      });
-      table.getMetaClient().reloadActiveTimeline();
-    }
-    // return the last clean metadata for now
-    // TODO (NA) : Clean only the earliest pending clean just like how we do for other table services
-    // This requires the BaseCleanActionExecutor to be refactored as BaseCommitActionExecutor
-    return cleanMetadataList.size() > 0 ? cleanMetadataList.get(cleanMetadataList.size() - 1) : null;
-  }
-}
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
new file mode 100644
index 0000000000000..a445fd3cc0907
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.clean;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.hudi.avro.model.HoodieActionInstant;
+import org.apache.hudi.avro.model.HoodieCleanMetadata;
+import org.apache.hudi.avro.model.HoodieCleanerPlan;
+import org.apache.hudi.client.transaction.TransactionManager;
+import org.apache.hudi.common.HoodieCleanStat;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.model.CleanFileInfo;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
+import org.apache.hudi.common.util.CleanerUtils;
+import org.apache.hudi.common.util.HoodieTimer;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.common.util.collection.ImmutablePair;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.table.HoodieTable;
+import org.apache.hudi.table.action.BaseActionExecutor;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class CleanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, HoodieCleanMetadata> {
+
+  private static final long serialVersionUID = 1L;
+  private static final Logger LOG = LogManager.getLogger(CleanActionExecutor.class);
+  private final TransactionManager txnManager;
+  private final boolean skipLocking;
+
+  public CleanActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime) {
+    this(context, config, table, instantTime, false);
+  }
+
+  public CleanActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, boolean skipLocking) {
+    super(context, config, table, instantTime);
+    this.txnManager = new TransactionManager(config, table.getMetaClient().getFs());
+    this.skipLocking = skipLocking;
+  }
+
+  static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) throws IOException {
+    Path deletePath = new Path(deletePathStr);
+    LOG.debug("Working on delete path :" + deletePath);
+    try {
+      boolean deleteResult = fs.delete(deletePath, false);
+      if (deleteResult) {
+        LOG.debug("Cleaned file at path :" + deletePath);
+      }
+      return deleteResult;
+    } catch (FileNotFoundException fio) {
+      // With cleanPlan being used for retried cleaning operations, it's possible to clean a file twice
+      return false;
+    }
+  }
+
+  static Stream<Pair<String, PartitionCleanStat>> deleteFilesFunc(Iterator<Pair<String, CleanFileInfo>> cleanFileInfo, HoodieTable table) {
+    Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
+    FileSystem fs = table.getMetaClient().getFs();
+
+    cleanFileInfo.forEachRemaining(partitionDelFileTuple -> {
+      String partitionPath = partitionDelFileTuple.getLeft();
+      Path deletePath = new Path(partitionDelFileTuple.getRight().getFilePath());
+      String deletePathStr = deletePath.toString();
+      Boolean deletedFileResult = null;
+      try {
+        deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
+
+      } catch (IOException e) {
+        LOG.error("Delete file failed: " + deletePathStr);
+      }
+      final PartitionCleanStat partitionCleanStat =
+          partitionCleanStatMap.computeIfAbsent(partitionPath, k -> new PartitionCleanStat(partitionPath));
+      boolean isBootstrapBasePathFile = partitionDelFileTuple.getRight().isBootstrapBaseFile();
+
+      if (isBootstrapBasePathFile) {
+        // For Bootstrap Base file deletions, store the full file path.
+        partitionCleanStat.addDeleteFilePatterns(deletePath.toString(), true);
+        partitionCleanStat.addDeletedFileResult(deletePath.toString(), deletedFileResult, true);
+      } else {
+        partitionCleanStat.addDeleteFilePatterns(deletePath.getName(), false);
+        partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult, false);
+      }
+    });
+    return partitionCleanStatMap.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue()));
+  }
+
+  /**
+   * Performs cleaning of partition paths according to the cleaning policy and returns the clean stats for each
+   * partition. Handles skews in partitions to clean by making the files to clean the unit of task distribution.
+   *
+   * @throws IllegalArgumentException if unknown cleaning policy is provided
+   */
+  List<HoodieCleanStat> clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) {
+    int cleanerParallelism = Math.min(
+        (int) (cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()),
+        config.getCleanerParallelism());
+    LOG.info("Using cleanerParallelism: " + cleanerParallelism);
+
+    context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions");
+
+    Stream<Pair<String, CleanFileInfo>> filesToBeDeletedPerPartition =
+        cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
+            .flatMap(x -> x.getValue().stream().map(y -> new ImmutablePair<>(x.getKey(),
+                new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile()))));
+
+    Stream<Pair<String, PartitionCleanStat>> partitionCleanStats =
+        context.mapPartitionsToPairAndReduceByKey(filesToBeDeletedPerPartition,
+            iterator -> deleteFilesFunc(iterator, table), PartitionCleanStat::merge, cleanerParallelism);
+
+    Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats
+        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
+
+    // Return PartitionCleanStat for each partition passed.
+    return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> {
+      PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath)
+          ? partitionCleanStatsMap.get(partitionPath)
+          : new PartitionCleanStat(partitionPath);
+      HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain();
+      return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath)
+          .withEarliestCommitRetained(Option.ofNullable(
+              actionInstant != null
+                  ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()),
+                  actionInstant.getAction(), actionInstant.getTimestamp())
+                  : null))
+          .withDeletePathPattern(partitionCleanStat.deletePathPatterns())
+          .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles())
+          .withFailedDeletes(partitionCleanStat.failedDeleteFiles())
+          .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns())
+          .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles())
+          .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles())
+          .build();
+    }).collect(Collectors.toList());
+  }
+
+
+  /**
+   * Executes the Cleaner plan stored in the instant metadata.
+   */
+  HoodieCleanMetadata runPendingClean(HoodieTable table, HoodieInstant cleanInstant) {
+    try {
+      HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(table.getMetaClient(), cleanInstant);
+      return runClean(table, cleanInstant, cleanerPlan);
+    } catch (IOException e) {
+      throw new HoodieIOException(e.getMessage(), e);
+    }
+  }
+
+  private HoodieCleanMetadata runClean(HoodieTable table, HoodieInstant cleanInstant, HoodieCleanerPlan cleanerPlan) {
+    ValidationUtils.checkArgument(cleanInstant.getState().equals(HoodieInstant.State.REQUESTED)
+        || cleanInstant.getState().equals(HoodieInstant.State.INFLIGHT));
+
+    try {
+      final HoodieInstant inflightInstant;
+      final HoodieTimer timer = new HoodieTimer();
+      timer.startTimer();
+      if (cleanInstant.isRequested()) {
+        inflightInstant = table.getActiveTimeline().transitionCleanRequestedToInflight(cleanInstant,
+            TimelineMetadataUtils.serializeCleanerPlan(cleanerPlan));
+      } else {
+        inflightInstant = cleanInstant;
+      }
+
+      List<HoodieCleanStat> cleanStats = clean(context, cleanerPlan);
+      if (cleanStats.isEmpty()) {
+        return HoodieCleanMetadata.newBuilder().build();
+      }
+
+      table.getMetaClient().reloadActiveTimeline();
+      HoodieCleanMetadata metadata = CleanerUtils.convertCleanMetadata(
+          inflightInstant.getTimestamp(),
+          Option.of(timer.endTimer()),
+          cleanStats
+      );
+      writeMetadata(metadata);
+      table.getActiveTimeline().transitionCleanInflightToComplete(inflightInstant,
+          TimelineMetadataUtils.serializeCleanMetadata(metadata));
+      LOG.info("Marked clean started on " + inflightInstant.getTimestamp() + " as complete");
+      return metadata;
+    } catch (IOException e) {
+      throw new HoodieIOException("Failed to clean up after commit", e);
+    }
+  }
+
+  /**
+   * Update metadata table if available. Any update to metadata table happens within data table lock.
+   * @param cleanMetadata instance of {@link HoodieCleanMetadata} to be applied to metadata.
+   */
+  private void writeMetadata(HoodieCleanMetadata cleanMetadata) {
+    if (config.isMetadataTableEnabled()) {
+      try {
+        if (!skipLocking) {
+          this.txnManager.beginTransaction(Option.empty(), Option.empty());
+        }
+        writeTableMetadata(cleanMetadata);
+      } finally {
+        if (!skipLocking) {
+          this.txnManager.endTransaction();
+        }
+      }
+    }
+  }
+
+  @Override
+  public HoodieCleanMetadata execute() {
+    List<HoodieCleanMetadata> cleanMetadataList = new ArrayList<>();
+    // If there are inflight(failed) or previously requested clean operation, first perform them
+    List<HoodieInstant> pendingCleanInstants = table.getCleanTimeline()
+        .filterInflightsAndRequested().getInstants().collect(Collectors.toList());
+    if (pendingCleanInstants.size() > 0) {
+      pendingCleanInstants.forEach(hoodieInstant -> {
+        LOG.info("Finishing previously unfinished cleaner instant=" + hoodieInstant);
+        try {
+          cleanMetadataList.add(runPendingClean(table, hoodieInstant));
+        } catch (Exception e) {
+          LOG.warn("Failed to perform previous clean operation, instant: " + hoodieInstant, e);
+        }
+      });
+      table.getMetaClient().reloadActiveTimeline();
+    }
+    // return the last clean metadata for now
+    // TODO (NA) : Clean only the earliest pending clean just like how we do for other table services
+    // This requires the CleanActionExecutor to be refactored as BaseCommitActionExecutor
+    return cleanMetadataList.size() > 0 ? cleanMetadataList.get(cleanMetadataList.size() - 1) : null;
+  }
+}
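
With the clean execution now living in hudi-client-common, the engine-specific subclasses largely disappear. Below is a minimal sketch of how a write client might drive this executor; the surrounding context, config, table and instantTime variables are assumptions, only the constructor and execute() shown above come from this patch (raw generics used for brevity):

    // Hypothetical call site; inputs are assumed to come from an engine-specific write client.
    CleanActionExecutor cleanExecutor =
        new CleanActionExecutor(context, config, table, instantTime, /* skipLocking */ false);
    // Finishes any requested/inflight clean instants and returns the last clean's metadata (or null if none).
    HoodieCleanMetadata cleanMetadata = cleanExecutor.execute();
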
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java
similarity index 90%
rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanPlanActionExecutor.java
rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java
index fc0c000a6a864..9b95bd718397e 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanPlanActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java
@@ -43,22 +43,24 @@
 import java.util.Map;
 import java.util.stream.Collectors;
 
-public abstract class BaseCleanPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieCleanerPlan>> {
+public class CleanPlanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieCleanerPlan>> {
 
   private static final Logger LOG = LogManager.getLogger(CleanPlanner.class);
 
   private final Option<Map<String, String>> extraMetadata;
 
-  public BaseCleanPlanActionExecutor(HoodieEngineContext context,
-                                     HoodieWriteConfig config,
-                                     HoodieTable table,
-                                     String instantTime,
-                                     Option<Map<String, String>> extraMetadata) {
+  public CleanPlanActionExecutor(HoodieEngineContext context,
+                                 HoodieWriteConfig config,
+                                 HoodieTable table,
+                                 String instantTime,
+                                 Option<Map<String, String>> extraMetadata) {
     super(context, config, table, instantTime);
     this.extraMetadata = extraMetadata;
   }
 
-  protected abstract Option<HoodieCleanerPlan> createCleanerPlan();
+  protected Option<HoodieCleanerPlan> createCleanerPlan() {
+    return execute();
+  }
 
   /**
    * Generates List of files to be cleaned.
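
After the rename, createCleanerPlan() simply delegates to execute(), so plan generation is shared across engines. A hedged sketch of scheduling a clean plan with it follows; everything except the constructor and createCleanerPlan() is an assumption:

    // Hypothetical scheduling call; no extra metadata is attached here.
    Option<HoodieCleanerPlan> cleanerPlan =
        new CleanPlanActionExecutor(context, config, table, instantTime, Option.empty())
            .createCleanerPlan();
    // Presumably non-empty only when the configured cleaning policy found files eligible for deletion.
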
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java
index 455952ae5f7d7..0d9cdc0aa062c 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java
@@ -63,11 +63,8 @@ public HoodieWriteMetadata write(String instantTime,
     }
   }
 
-  private I tag(
-      I dedupedRecords, HoodieEngineContext context, HoodieTable table) {
-    // perform index lookup to get existing location of records
-    return table.getIndex().tagLocation(dedupedRecords, context, table);
-  }
+  protected abstract I tag(
+      I dedupedRecords, HoodieEngineContext context, HoodieTable table);
 
   public I combineOnCondition(
       boolean condition, I records, int parallelism, HoodieTable table) {
@@ -87,5 +84,5 @@ public I deduplicateRecords(
   }
 
   public abstract I deduplicateRecords(
-      I records, HoodieIndex index, int parallelism);
+      I records, HoodieIndex index, int parallelism);
 }
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java
index 4b519ed92b4aa..ce6ed5db303c7 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java
@@ -175,10 +175,6 @@ protected void finalizeWrite(String instantTime, List stats, Ho
     }
   }
 
-  protected void syncTableMetadata() {
-    // No Op
-  }
-
   /**
    * By default, return the writer schema in Write Config for storing in commit.
    */
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/BaseScheduleCompactionActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/BaseScheduleCompactionActionExecutor.java
deleted file mode 100644
index 25c2fec866b57..0000000000000
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/BaseScheduleCompactionActionExecutor.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.table.action.compact;
-
-import org.apache.hudi.avro.model.HoodieCompactionPlan;
-import org.apache.hudi.common.engine.HoodieEngineContext;
-import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.table.timeline.HoodieInstant;
-import org.apache.hudi.common.table.timeline.HoodieTimeline;
-import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.ValidationUtils;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.table.HoodieTable;
-import org.apache.hudi.table.action.BaseActionExecutor;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-
-public abstract class BaseScheduleCompactionActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieCompactionPlan>> {
-
-  private final Option<Map<String, String>> extraMetadata;
-
-  public BaseScheduleCompactionActionExecutor(HoodieEngineContext context,
-                                              HoodieWriteConfig config,
-                                              HoodieTable table,
-                                              String instantTime,
-                                              Option> extraMetadata) {
-    super(context, config, table, instantTime);
-    this.extraMetadata = extraMetadata;
-  }
-
-  protected abstract HoodieCompactionPlan scheduleCompaction();
-
-  @Override
-  public Option<HoodieCompactionPlan> execute() {
-    if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()
-        && !config.getFailedWritesCleanPolicy().isLazy()) {
-      // if there are inflight writes, their instantTime must not be less than that of compaction instant time
-      table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant()
-          .ifPresent(earliestInflight -> ValidationUtils.checkArgument(
-              HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime),
-              "Earliest write inflight instant time must be later than compaction time. Earliest :" + earliestInflight
-                  + ", Compaction scheduled at " + instantTime));
-      // Committed and pending compaction instants should have strictly lower timestamps
-      List<HoodieInstant> conflictingInstants = table.getActiveTimeline()
-          .getWriteTimeline().filterCompletedAndCompactionInstants().getInstants()
-          .filter(instant -> HoodieTimeline.compareTimestamps(
-              instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, instantTime))
-          .collect(Collectors.toList());
-      ValidationUtils.checkArgument(conflictingInstants.isEmpty(),
-          "Following instants have timestamps >= compactionInstant (" + instantTime + ") Instants :"
-              + conflictingInstants);
-    }
-
-    HoodieCompactionPlan plan = scheduleCompaction();
-    if (plan != null && (plan.getOperations() != null) && (!plan.getOperations().isEmpty())) {
-      extraMetadata.ifPresent(plan::setExtraMetadata);
-      HoodieInstant compactionInstant =
-          new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime);
-      try {
-        table.getActiveTimeline().saveToCompactionRequested(compactionInstant,
-            TimelineMetadataUtils.serializeCompactionPlan(plan));
-      } catch (IOException ioe) {
-        throw new HoodieIOException("Exception scheduling compaction", ioe);
-      }
-      return Option.of(plan);
-    }
-    return Option.empty();
-  }
-}
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/AbstractCompactHelpers.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactHelpers.java
similarity index 59%
rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/AbstractCompactHelpers.java
rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactHelpers.java
index 3ff9e625e8c7c..a348eb0ed3a76 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/AbstractCompactHelpers.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactHelpers.java
@@ -18,17 +18,23 @@
 
 package org.apache.hudi.table.action.compact;
 
+import org.apache.hudi.avro.model.HoodieCompactionPlan;
+import org.apache.hudi.client.WriteStatus;
+import org.apache.hudi.common.data.HoodieData;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.exception.HoodieCompactionException;
 import org.apache.hudi.table.HoodieTable;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.util.List;
 
 /**
  * Base class that helps perform compaction.
@@ -38,11 +44,34 @@
  * @param <K> Type of keys
  * @param <O> Type of outputs
  */
-public abstract class AbstractCompactHelpers<T extends HoodieRecordPayload, I, K, O> {
-  public abstract HoodieCommitMetadata createCompactionMetadata(HoodieTable table,
-                                                                String compactionInstantTime,
-                                                                O writeStatuses,
-                                                                String schema) throws IOException;
+public class CompactHelpers<T extends HoodieRecordPayload, I, K, O> {
+
+  private static final CompactHelpers SINGLETON_INSTANCE = new CompactHelpers();
+
+  private CompactHelpers() {
+  }
+
+  public static CompactHelpers getInstance() {
+    return SINGLETON_INSTANCE;
+  }
+
+  public HoodieCommitMetadata createCompactionMetadata(
+      HoodieTable table, String compactionInstantTime, HoodieData<WriteStatus> writeStatuses,
+      String schema) throws IOException {
+    byte[] planBytes = table.getActiveTimeline().readCompactionPlanAsBytes(
+        HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get();
+    HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan(planBytes);
+    List<HoodieWriteStat> updateStatusMap = writeStatuses.map(WriteStatus::getStat).collectAsList();
+    HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
+    for (HoodieWriteStat stat : updateStatusMap) {
+      metadata.addWriteStat(stat.getPartitionPath(), stat);
+    }
+    metadata.addMetadata(org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY, schema);
+    if (compactionPlan.getExtraMetadata() != null) {
+      compactionPlan.getExtraMetadata().forEach(metadata::addMetadata);
+    }
+    return metadata;
+  }
 
   public void completeInflightCompaction(HoodieTable table, String compactionCommitTime, HoodieCommitMetadata commitMetadata) {
     HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
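
Compaction commit metadata is now built through the shared singleton rather than per-engine subclasses. A hedged usage sketch; table, compactionInstantTime, writeStatuses and schema are assumed to come from the calling compaction executor:

    // Hypothetical caller; createCompactionMetadata(...) and completeInflightCompaction(...) are from this patch.
    HoodieCommitMetadata commitMetadata = CompactHelpers.getInstance()
        .createCompactionMetadata(table, compactionInstantTime, writeStatuses, schema);
    CompactHelpers.getInstance()
        .completeInflightCompaction(table, compactionInstantTime, commitMetadata);
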
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java
index c92c0b3a0237e..419f88eef4fd2 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java
@@ -18,39 +18,281 @@
 
 package org.apache.hudi.table.action.compact;
 
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.avro.model.HoodieCompactionOperation;
 import org.apache.hudi.avro.model.HoodieCompactionPlan;
+import org.apache.hudi.client.WriteStatus;
+import org.apache.hudi.common.data.HoodieAccumulator;
+import org.apache.hudi.common.data.HoodieData;
 import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.engine.TaskContextSupplier;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.CompactionOperation;
+import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieFileGroupId;
+import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
+import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.view.TableFileSystemView.SliceView;
+import org.apache.hudi.common.util.CollectionUtils;
+import org.apache.hudi.common.util.CompactionUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.io.IOUtils;
+import org.apache.hudi.table.HoodieCompactionHandler;
 import org.apache.hudi.table.HoodieTable;
+import org.apache.hudi.table.action.compact.strategy.CompactionStrategy;
+
+import org.apache.avro.Schema;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
 
 import java.io.IOException;
 import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
 import java.util.Set;
+import java.util.stream.StreamSupport;
+
+import static java.util.stream.Collectors.toList;
 
 /**
  * A HoodieCompactor runs compaction on a hoodie table.
  */
-public interface HoodieCompactor<T extends HoodieRecordPayload, I, K, O> extends Serializable {
+public abstract class HoodieCompactor<T extends HoodieRecordPayload, I, K, O> implements Serializable {
+
+  private static final Logger LOG = LogManager.getLogger(HoodieCompactor.class);
 
   /**
-   * Generate a new compaction plan for scheduling.
+   * Handles the compaction timeline based on the compaction instant before actual compaction.
    *
-   * @param context HoodieEngineContext
-   * @param hoodieTable Hoodie Table
-   * @param config Hoodie Write Configuration
-   * @param compactionCommitTime scheduled compaction commit time
-   * @param fgIdsInPendingCompactions partition-fileId pairs for which compaction is pending
-   * @return Compaction Plan
-   * @throws IOException when encountering errors
+   * @param table                     {@link HoodieTable} instance to use.
+   * @param pendingCompactionTimeline pending compaction timeline.
+   * @param compactionInstantTime     compaction instant
+   */
+  public abstract void preCompact(
+      HoodieTable table, HoodieTimeline pendingCompactionTimeline, String compactionInstantTime);
+
+  /**
+   * Maybe persist write status.
+   *
+   * @param writeStatus {@link HoodieData} of {@link WriteStatus}.
    */
-  HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, HoodieTable hoodieTable, HoodieWriteConfig config,
-                                              String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactions) throws IOException;
+  public abstract void maybePersist(HoodieData<WriteStatus> writeStatus, HoodieWriteConfig config);
 
   /**
    * Execute compaction operations and report back status.
    */
-  O compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable,
-      HoodieWriteConfig config, String compactionInstantTime) throws IOException;
+  public HoodieData<WriteStatus> compact(
+      HoodieEngineContext context, HoodieCompactionPlan compactionPlan,
+      HoodieTable table, HoodieWriteConfig config, String compactionInstantTime,
+      HoodieCompactionHandler compactionHandler) {
+    if (compactionPlan == null || (compactionPlan.getOperations() == null)
+        || (compactionPlan.getOperations().isEmpty())) {
+      return context.emptyHoodieData();
+    }
+    HoodieActiveTimeline timeline = table.getActiveTimeline();
+    HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
+    // Mark instant as compaction inflight
+    timeline.transitionCompactionRequestedToInflight(instant);
+    table.getMetaClient().reloadActiveTimeline();
+
+    HoodieTableMetaClient metaClient = table.getMetaClient();
+    TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient);
+
+    // Here we first use the table schema as the reader schema to read the
+    // log files. That is because, in the case of MergeInto, config.getSchema may not be
+    // the same as the table schema.
+    try {
+      Schema readerSchema = schemaUtil.getTableAvroSchema(false);
+      config.setSchema(readerSchema.toString());
+    } catch (Exception e) {
+      // If there is no commit in the table, just ignore the exception.
+    }
+
+    // Compacting is very similar to applying updates to existing file
+    List<CompactionOperation> operations = compactionPlan.getOperations().stream()
+        .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
+    LOG.info("Compactor compacting " + operations + " files");
+
+    context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices");
+    TaskContextSupplier taskContextSupplier = table.getTaskContextSupplier();
+    return context.parallelize(operations).map(operation -> compact(
+        compactionHandler, metaClient, config, operation, compactionInstantTime, taskContextSupplier))
+        .flatMap(List::iterator);
+  }
+
+  /**
+   * Execute a single compaction operation and report back status.
+   */
+  public List<WriteStatus> compact(HoodieCompactionHandler compactionHandler,
+                                   HoodieTableMetaClient metaClient,
+                                   HoodieWriteConfig config,
+                                   CompactionOperation operation,
+                                   String instantTime,
+                                   TaskContextSupplier taskContextSupplier) throws IOException {
+    FileSystem fs = metaClient.getFs();
+
+    Schema readerSchema = HoodieAvroUtils.addMetadataFields(
+        new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
+    LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames()
+        + " for commit " + instantTime);
+    // TODO - FIX THIS
+    // Reads the entire avro file. Always only specific blocks should be read from the avro file
+    // (failure recover).
+    // Load all the delta commits since the last compaction commit and get all the blocks to be
+    // loaded and load it using CompositeAvroLogReader
+    // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
+    String maxInstantTime = metaClient
+        .getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION,
+            HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION))
+        .filterCompletedInstants().lastInstant().get().getTimestamp();
+    long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(taskContextSupplier, config);
+    LOG.info("MaxMemoryPerCompaction => " + maxMemoryPerCompaction);
+
+    List<String> logFiles = operation.getDeltaFileNames().stream().map(
+        p -> new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), operation.getPartitionPath()), p).toString())
+        .collect(toList());
+    HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
+        .withFileSystem(fs)
+        .withBasePath(metaClient.getBasePath())
+        .withLogFilePaths(logFiles)
+        .withReaderSchema(readerSchema)
+        .withLatestInstantTime(maxInstantTime)
+        .withMaxMemorySizeInBytes(maxMemoryPerCompaction)
+        .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled())
+        .withReverseReader(config.getCompactionReverseLogReadEnabled())
+        .withBufferSize(config.getMaxDFSStreamBufferSize())
+        .withSpillableMapBasePath(config.getSpillableMapBasePath())
+        .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType())
+        .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled())
+        .build();
+    if (!scanner.iterator().hasNext()) {
+      scanner.close();
+      return new ArrayList<>();
+    }
+
+    Option<HoodieBaseFile> oldDataFileOpt =
+        operation.getBaseFile(metaClient.getBasePath(), operation.getPartitionPath());
+
+    // Compacting is very similar to applying updates to existing file
+    Iterator<List<WriteStatus>> result;
+    // If the dataFile is present, perform updates else perform inserts into a new base file.
+    if (oldDataFileOpt.isPresent()) {
+      result = compactionHandler.handleUpdate(instantTime, operation.getPartitionPath(),
+          operation.getFileId(), scanner.getRecords(),
+          oldDataFileOpt.get());
+    } else {
+      result = compactionHandler.handleInsert(instantTime, operation.getPartitionPath(), operation.getFileId(),
+          scanner.getRecords());
+    }
+    scanner.close();
+    Iterable<List<WriteStatus>> resultIterable = () -> result;
+    return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> {
+      s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog());
+      s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
+      s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
+      s.getStat().setPartitionPath(operation.getPartitionPath());
+      s.getStat()
+          .setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
+      s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
+      s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks());
+      s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks());
+      RuntimeStats runtimeStats = new RuntimeStats();
+      runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks());
+      s.getStat().setRuntimeStats(runtimeStats);
+    }).collect(toList());
+  }
+
+  /**
+   * Generate a new compaction plan for scheduling.
+   *
+   * @param context                               HoodieEngineContext
+   * @param hoodieTable                           Hoodie Table
+   * @param config                                Hoodie Write Configuration
+   * @param compactionCommitTime                  scheduled compaction commit time
+   * @param fgIdsInPendingCompactionAndClustering partition-fileId pairs for which compaction is pending
+   * @return Compaction Plan
+   * @throws IOException when encountering errors
+   */
+  HoodieCompactionPlan generateCompactionPlan(
+      HoodieEngineContext context, HoodieTable hoodieTable, HoodieWriteConfig config,
+      String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactionAndClustering) throws IOException {
+    // Accumulator to keep track of total log files for a table
+    HoodieAccumulator totalLogFiles = context.newAccumulator();
+    // Accumulator to keep track of total log file slices for a table
+    HoodieAccumulator totalFileSlices = context.newAccumulator();
+
+    ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
+        "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not "
+            + hoodieTable.getMetaClient().getTableType().name());
+
+    // TODO : check if maxMemory is not greater than JVM or executor memory
+    // TODO - rollback any compactions in flight
+    HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
+    LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
+    List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath());
+
+    // filter the partition paths if needed to reduce list status
+    partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths);
+
+    if (partitionPaths.isEmpty()) {
+      // In case no partitions could be picked, return no compaction plan
+      return null;
+    }
+
+    SliceView fileSystemView = hoodieTable.getSliceView();
+    LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
+    context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact");
+
+    List<HoodieCompactionOperation> operations = context.flatMap(partitionPaths, partitionPath -> fileSystemView
+        .getLatestFileSlices(partitionPath)
+        .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId()))
+        .map(s -> {
+          List<HoodieLogFile> logFiles =
+              s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(toList());
+          totalLogFiles.add(logFiles.size());
+          totalFileSlices.add(1L);
+          // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO
+          // for Map operations and collecting them finally in Avro generated classes for storing
+          // into meta files.
+          Option<HoodieBaseFile> dataFile = s.getBaseFile();
+          return new CompactionOperation(dataFile, partitionPath, logFiles,
+              config.getCompactionStrategy().captureMetrics(config, s));
+        })
+        .filter(c -> !c.getDeltaFileNames().isEmpty()), partitionPaths.size()).stream()
+        .map(CompactionUtils::buildHoodieCompactionOperation).collect(toList());
+
+    LOG.info("Total of " + operations.size() + " compactions are retrieved");
+    LOG.info("Total number of latest files slices " + totalFileSlices.value());
+    LOG.info("Total number of log files " + totalLogFiles.value());
+    LOG.info("Total number of file slices " + totalFileSlices.value());
+    // Filter the compactions with the passed in filter. This lets us choose most effective
+    // compactions only
+    HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
+        CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
+    ValidationUtils.checkArgument(
+        compactionPlan.getOperations().stream().noneMatch(
+            op -> fgIdsInPendingCompactionAndClustering.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))),
+        "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
+            + "Please fix your strategy implementation. FileIdsWithPendingCompactions :" + fgIdsInPendingCompactionAndClustering
+            + ", Selected workload :" + compactionPlan);
+    if (compactionPlan.getOperations().isEmpty()) {
+      LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
+    }
+    return compactionPlan;
+  }
 }
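
Engine-specific compactors now only implement the two hooks; the shared compact(...) and generateCompactionPlan(...) above do the heavy lifting. Below is a minimal, illustrative subclass; the class name is an assumption and the real Spark/Flink implementations are outside this hunk:

    // Illustrative subclass, not part of this patch.
    public class ExampleMergeOnReadCompactor extends HoodieCompactor {

      @Override
      public void preCompact(HoodieTable table, HoodieTimeline pendingCompactionTimeline, String compactionInstantTime) {
        // e.g. refuse to run when no compaction was requested at this instant,
        // mirroring the check the removed Spark executor used to do inline
        if (!pendingCompactionTimeline.containsInstant(
            HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime))) {
          throw new IllegalStateException("No compaction request available at " + compactionInstantTime);
        }
      }

      @Override
      public void maybePersist(HoodieData<WriteStatus> writeStatus, HoodieWriteConfig config) {
        // no-op here; an engine such as Spark could persist the statuses before collecting stats
      }
    }
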
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkRunCompactionActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/RunCompactionActionExecutor.java
similarity index 60%
rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkRunCompactionActionExecutor.java
rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/RunCompactionActionExecutor.java
index 5851b08c69e52..5e3005b22fb23 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkRunCompactionActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/RunCompactionActionExecutor.java
@@ -20,64 +20,62 @@
 
 import org.apache.hudi.avro.model.HoodieCompactionPlan;
 import org.apache.hudi.client.WriteStatus;
-import org.apache.hudi.client.common.HoodieSparkEngineContext;
-import org.apache.hudi.client.utils.SparkMemoryUtils;
+import org.apache.hudi.common.data.HoodieData;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.HoodieWriteStat;
-import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
-import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.CompactionUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieCompactionException;
+import org.apache.hudi.table.HoodieCompactionHandler;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.action.BaseActionExecutor;
 import org.apache.hudi.table.action.HoodieWriteMetadata;
 
-import org.apache.spark.api.java.JavaRDD;
-
 import java.io.IOException;
 import java.util.List;
 
 @SuppressWarnings("checkstyle:LineLength")
-public class SparkRunCompactionActionExecutor<T extends HoodieRecordPayload> extends
-    BaseActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, HoodieWriteMetadata<JavaRDD<WriteStatus>>> {
+public class RunCompactionActionExecutor<T extends HoodieRecordPayload> extends
+    BaseActionExecutor<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>, HoodieWriteMetadata<HoodieData<WriteStatus>>> {
+
+  private final HoodieCompactor compactor;
+  private final HoodieCompactionHandler compactionHandler;
 
-  public SparkRunCompactionActionExecutor(HoodieSparkEngineContext context,
-                                          HoodieWriteConfig config,
-                                          HoodieTable>, JavaRDD, JavaRDD> table,
-                                          String instantTime) {
+  public RunCompactionActionExecutor(HoodieEngineContext context,
+                                     HoodieWriteConfig config,
+                                     HoodieTable table,
+                                     String instantTime,
+                                     HoodieCompactor compactor,
+                                     HoodieCompactionHandler compactionHandler) {
     super(context, config, table, instantTime);
+    this.compactor = compactor;
+    this.compactionHandler = compactionHandler;
   }
 
   @Override
-  public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
-    HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(instantTime);
+  public HoodieWriteMetadata<HoodieData<WriteStatus>> execute() {
     HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline();
-    if (!pendingCompactionTimeline.containsInstant(instant)) {
-      throw new IllegalStateException(
-          "No Compaction request available at " + instantTime + " to run compaction");
-    }
+    compactor.preCompact(table, pendingCompactionTimeline, instantTime);
 
-    HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = new HoodieWriteMetadata<>();
+    HoodieWriteMetadata<HoodieData<WriteStatus>> compactionMetadata = new HoodieWriteMetadata<>();
     try {
-      HoodieActiveTimeline timeline = table.getActiveTimeline();
+      // generate compaction plan
+      // should support configurable commit metadata
       HoodieCompactionPlan compactionPlan =
           CompactionUtils.getCompactionPlan(table.getMetaClient(), instantTime);
-      // Mark instant as compaction inflight
-      timeline.transitionCompactionRequestedToInflight(instant);
-      table.getMetaClient().reloadActiveTimeline();
 
-      HoodieSparkMergeOnReadTableCompactor compactor = new HoodieSparkMergeOnReadTableCompactor();
-      JavaRDD<WriteStatus> statuses = compactor.compact(context, compactionPlan, table, config, instantTime);
+      HoodieData<WriteStatus> statuses = compactor.compact(
+          context, compactionPlan, table, config, instantTime, compactionHandler);
 
-      statuses.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps()));
+      compactor.maybePersist(statuses, config);
       context.setJobStatus(this.getClass().getSimpleName(), "Preparing compaction metadata");
-      List<HoodieWriteStat> updateStatusMap = statuses.map(WriteStatus::getStat).collect();
+      List<HoodieWriteStat> updateStatusMap = statuses.map(WriteStatus::getStat).collectAsList();
       HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
       for (HoodieWriteStat stat : updateStatusMap) {
         metadata.addWriteStat(stat.getPartitionPath(), stat);
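
The run executor is now wired with whichever compactor and compaction handler the engine supplies, instead of instantiating the Spark ones itself. A hedged sketch of invoking it; compactor and compactionHandler are assumed engine-specific instances:

    // Hypothetical wiring; only the constructor and execute() are defined in this patch.
    HoodieWriteMetadata<HoodieData<WriteStatus>> compactionResult =
        new RunCompactionActionExecutor(context, config, table, compactionInstantTime,
            compactor, compactionHandler)
            .execute();
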
diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/FlinkScheduleCompactionActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java
similarity index 64%
rename from hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/FlinkScheduleCompactionActionExecutor.java
rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java
index 4143944bbebc8..12a00181dcf6a 100644
--- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/FlinkScheduleCompactionActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java
@@ -19,21 +19,22 @@
 package org.apache.hudi.table.action.compact;
 
 import org.apache.hudi.avro.model.HoodieCompactionPlan;
-import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieFileGroupId;
-import org.apache.hudi.common.model.HoodieKey;
-import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
 import org.apache.hudi.common.table.view.SyncableFileSystemView;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieCompactionException;
+import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.table.HoodieTable;
+import org.apache.hudi.table.action.BaseActionExecutor;
 
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
@@ -45,31 +46,67 @@
 import java.util.Set;
 import java.util.stream.Collectors;
 
-@SuppressWarnings("checkstyle:LineLength")
-public class FlinkScheduleCompactionActionExecutor<T extends HoodieRecordPayload> extends
-    BaseScheduleCompactionActionExecutor<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> {
+public class ScheduleCompactionActionExecutor<T extends HoodieRecordPayload, I, K, O> extends BaseActionExecutor<T, I, K, O, Option<HoodieCompactionPlan>> {
 
-  private static final Logger LOG = LogManager.getLogger(FlinkScheduleCompactionActionExecutor.class);
+  private static final Logger LOG = LogManager.getLogger(ScheduleCompactionActionExecutor.class);
 
   private final Option<Map<String, String>> extraMetadata;
+  private final HoodieCompactor compactor;
 
-  public FlinkScheduleCompactionActionExecutor(HoodieEngineContext context,
-                                               HoodieWriteConfig config,
-                                               HoodieTable>, List, List> table,
-                                               String instantTime,
-                                               Option> extraMetadata) {
-    super(context, config, table, instantTime, extraMetadata);
+  public ScheduleCompactionActionExecutor(HoodieEngineContext context,
+                                          HoodieWriteConfig config,
+                                          HoodieTable table,
+                                          String instantTime,
+                                          Option<Map<String, String>> extraMetadata,
+                                          HoodieCompactor compactor) {
+    super(context, config, table, instantTime);
     this.extraMetadata = extraMetadata;
+    this.compactor = compactor;
   }
 
   @Override
-  protected HoodieCompactionPlan scheduleCompaction() {
+  public Option<HoodieCompactionPlan> execute() {
+    if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()
+        && !config.getFailedWritesCleanPolicy().isLazy()) {
+      // if there are inflight writes, their instantTime must not be less than that of compaction instant time
+      table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant()
+          .ifPresent(earliestInflight -> ValidationUtils.checkArgument(
+              HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime),
+              "Earliest write inflight instant time must be later than compaction time. Earliest :" + earliestInflight
+                  + ", Compaction scheduled at " + instantTime));
+      // Committed and pending compaction instants should have strictly lower timestamps
+      List<HoodieInstant> conflictingInstants = table.getActiveTimeline()
+          .getWriteTimeline().filterCompletedAndCompactionInstants().getInstants()
+          .filter(instant -> HoodieTimeline.compareTimestamps(
+              instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, instantTime))
+          .collect(Collectors.toList());
+      ValidationUtils.checkArgument(conflictingInstants.isEmpty(),
+          "Following instants have timestamps >= compactionInstant (" + instantTime + ") Instants :"
+              + conflictingInstants);
+    }
+
+    HoodieCompactionPlan plan = scheduleCompaction();
+    if (plan != null && (plan.getOperations() != null) && (!plan.getOperations().isEmpty())) {
+      extraMetadata.ifPresent(plan::setExtraMetadata);
+      HoodieInstant compactionInstant =
+          new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime);
+      try {
+        table.getActiveTimeline().saveToCompactionRequested(compactionInstant,
+            TimelineMetadataUtils.serializeCompactionPlan(plan));
+      } catch (IOException ioe) {
+        throw new HoodieIOException("Exception scheduling compaction", ioe);
+      }
+      return Option.of(plan);
+    }
+    return Option.empty();
+  }
+
+  private HoodieCompactionPlan scheduleCompaction() {
     LOG.info("Checking if compaction needs to be run on " + config.getBasePath());
     // judge if we need to compact according to num delta commits and time elapsed
     boolean compactable = needCompact(config.getInlineCompactTriggerStrategy());
     if (compactable) {
       LOG.info("Generating compaction plan for merge on read table " + config.getBasePath());
-      HoodieFlinkMergeOnReadTableCompactor compactor = new HoodieFlinkMergeOnReadTableCompactor();
       try {
         SyncableFileSystemView fileSystemView = (SyncableFileSystemView) table.getSliceView();
         Set<HoodieFileGroupId> fgInPendingCompactionAndClustering = fileSystemView.getPendingCompactionOperations()
@@ -86,7 +123,7 @@ protected HoodieCompactionPlan scheduleCompaction() {
     return new HoodieCompactionPlan();
   }
 
-  public Pair getLatestDeltaCommitInfo(CompactionTriggerStrategy compactionTriggerStrategy) {
+  private Pair getLatestDeltaCommitInfo(CompactionTriggerStrategy compactionTriggerStrategy) {
     Option lastCompaction = table.getActiveTimeline().getCommitTimeline()
         .filterCompletedInstants().lastInstant();
     HoodieTimeline deltaCommits = table.getActiveTimeline().getDeltaCommitTimeline();
@@ -103,7 +140,7 @@ public Pair getLatestDeltaCommitInfo(CompactionTriggerStrategy
     return Pair.of(deltaCommitsSinceLastCompaction, latestInstantTs);
   }
 
-  public boolean needCompact(CompactionTriggerStrategy compactionTriggerStrategy) {
+  private boolean needCompact(CompactionTriggerStrategy compactionTriggerStrategy) {
     boolean compactable;
     // get deltaCommitsSinceLastCompaction and lastCompactionTs
     Pair latestDeltaCommitInfo = getLatestDeltaCommitInfo(compactionTriggerStrategy);
@@ -144,10 +181,10 @@ public boolean needCompact(CompactionTriggerStrategy compactionTriggerStrategy)
     return compactable;
   }
 
-  public Long parsedToSeconds(String time) {
+  private Long parsedToSeconds(String time) {
     long timestamp;
     try {
-      timestamp = HoodieActiveTimeline.COMMIT_FORMATTER.parse(time).getTime() / 1000;
+      timestamp = HoodieActiveTimeline.parseInstantTime(time).getTime() / 1000;
     } catch (ParseException e) {
       throw new HoodieCompactionException(e.getMessage(), e);
     }
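
[Editor's note] The compaction-scheduling change above gates plan generation on needCompact, which weighs delta commits since the last compaction and elapsed time against configured thresholds. Below is a minimal, self-contained sketch of that decision logic, not the Hudi implementation; the strategy names and threshold parameters are illustrative stand-ins.

    import java.util.concurrent.TimeUnit;

    public class CompactionTriggerSketch {
      // illustrative trigger strategies; names loosely mirror "num delta commits and time elapsed"
      enum TriggerStrategy { NUM_COMMITS, TIME_ELAPSED, NUM_OR_TIME, NUM_AND_TIME }

      static boolean needCompact(TriggerStrategy strategy,
                                 int deltaCommitsSinceLastCompaction,
                                 long secondsSinceLastCompaction,
                                 int maxDeltaCommits,
                                 long maxDeltaSeconds) {
        boolean enoughCommits = deltaCommitsSinceLastCompaction >= maxDeltaCommits;
        boolean enoughTime = secondsSinceLastCompaction >= maxDeltaSeconds;
        switch (strategy) {
          case NUM_COMMITS:
            return enoughCommits;
          case TIME_ELAPSED:
            return enoughTime;
          case NUM_OR_TIME:
            return enoughCommits || enoughTime;
          case NUM_AND_TIME:
            return enoughCommits && enoughTime;
          default:
            throw new IllegalArgumentException("Unknown trigger strategy: " + strategy);
        }
      }

      public static void main(String[] args) {
        // 5 delta commits and 1 hour elapsed, thresholds of 4 commits or 1 hour -> compaction is due
        System.out.println(needCompact(TriggerStrategy.NUM_OR_TIME, 5, TimeUnit.HOURS.toSeconds(1), 4, 3600));
      }
    }
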
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java
index 3b722a7a41ac9..ac8f9940d4b36 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java
@@ -20,6 +20,7 @@
 
 import org.apache.hudi.avro.model.HoodieRestoreMetadata;
 import org.apache.hudi.avro.model.HoodieRollbackMetadata;
+import org.apache.hudi.client.transaction.TransactionManager;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
@@ -27,6 +28,7 @@
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
 import org.apache.hudi.common.util.HoodieTimer;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieRollbackException;
 import org.apache.hudi.table.HoodieTable;
@@ -46,6 +48,7 @@ public abstract class BaseRestoreActionExecutor extends
-    BaseRestoreActionExecutor>, List, List> {
-
-  public JavaCopyOnWriteRestoreActionExecutor(HoodieJavaEngineContext context,
-                                              HoodieWriteConfig config,
-                                              HoodieTable table,
-                                              String instantTime,
-                                              String restoreInstantTime) {
+public class CopyOnWriteRestoreActionExecutor
+    extends BaseRestoreActionExecutor {
+  public CopyOnWriteRestoreActionExecutor(HoodieEngineContext context,
+                                          HoodieWriteConfig config,
+                                          HoodieTable table,
+                                          String instantTime,
+                                          String restoreInstantTime) {
     super(context, config, table, instantTime, restoreInstantTime);
   }
 
   @Override
   protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) {
+    if (!instantToRollback.getAction().equals(HoodieTimeline.COMMIT_ACTION)
+        && !instantToRollback.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) {
+      throw new HoodieRollbackException("Unsupported action in rollback instant:" + instantToRollback);
+    }
     table.getMetaClient().reloadActiveTimeline();
-    JavaCopyOnWriteRollbackActionExecutor rollbackActionExecutor = new JavaCopyOnWriteRollbackActionExecutor(
+    String newInstantTime = HoodieActiveTimeline.createNewInstantTime();
+    table.scheduleRollback(context, newInstantTime, instantToRollback, false);
+    table.getMetaClient().reloadActiveTimeline();
+    CopyOnWriteRollbackActionExecutor rollbackActionExecutor = new CopyOnWriteRollbackActionExecutor(
         context,
         config,
         table,
-        HoodieActiveTimeline.createNewInstantTime(),
+        newInstantTime,
         instantToRollback,
         true,
         true,
+        false,
         false);
-    if (!instantToRollback.getAction().equals(HoodieTimeline.COMMIT_ACTION)
-        && !instantToRollback.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) {
-      throw new HoodieRollbackException("Unsupported action in rollback instant:" + instantToRollback);
-    }
     return rollbackActionExecutor.execute();
   }
 }
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkMergeOnReadRestoreActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java
similarity index 54%
rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkMergeOnReadRestoreActionExecutor.java
rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java
index c320579380b1d..db6fbc2620155 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkMergeOnReadRestoreActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java
@@ -7,57 +7,37 @@
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ *   http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
  */
 
 package org.apache.hudi.table.action.restore;
 
 import org.apache.hudi.avro.model.HoodieRollbackMetadata;
-import org.apache.hudi.client.WriteStatus;
-import org.apache.hudi.client.common.HoodieSparkEngineContext;
-import org.apache.hudi.common.model.HoodieKey;
-import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.table.HoodieTable;
-import org.apache.hudi.table.action.rollback.SparkMergeOnReadRollbackActionExecutor;
+import org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor;
 
-import org.apache.spark.api.java.JavaRDD;
-
-@SuppressWarnings("checkstyle:LineLength")
-public class SparkMergeOnReadRestoreActionExecutor extends
-    BaseRestoreActionExecutor>, JavaRDD, JavaRDD> {
-
-  public SparkMergeOnReadRestoreActionExecutor(HoodieSparkEngineContext context,
-                                               HoodieWriteConfig config,
-                                               HoodieTable table,
-                                               String instantTime,
-                                               String restoreInstantTime) {
+public class MergeOnReadRestoreActionExecutor
+    extends BaseRestoreActionExecutor {
+  public MergeOnReadRestoreActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table,
+                                          String instantTime, String restoreInstantTime) {
     super(context, config, table, instantTime, restoreInstantTime);
   }
 
   @Override
   protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) {
-    table.getMetaClient().reloadActiveTimeline();
-    SparkMergeOnReadRollbackActionExecutor rollbackActionExecutor = new SparkMergeOnReadRollbackActionExecutor(
-        context,
-        config,
-        table,
-        HoodieActiveTimeline.createNewInstantTime(),
-        instantToRollback,
-        true,
-        true,
-        false);
-
     switch (instantToRollback.getAction()) {
       case HoodieTimeline.COMMIT_ACTION:
       case HoodieTimeline.DELTA_COMMIT_ACTION:
@@ -66,9 +46,28 @@ protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback
         // TODO : Get file status and create a rollback stat and file
         // TODO : Delete the .aux files along with the instant file, okay for now since the archival process will
         // delete these files when it does not see a corresponding instant file under .hoodie
-        return rollbackActionExecutor.execute();
+        break;
       default:
         throw new IllegalArgumentException("invalid action name " + instantToRollback.getAction());
     }
+    table.getMetaClient().reloadActiveTimeline();
+    String instantTime = HoodieActiveTimeline.createNewInstantTime();
+    table.scheduleRollback(context, instantTime, instantToRollback, false);
+    table.getMetaClient().reloadActiveTimeline();
+    MergeOnReadRollbackActionExecutor rollbackActionExecutor = new MergeOnReadRollbackActionExecutor(
+        context,
+        config,
+        table,
+        instantTime,
+        instantToRollback,
+        true,
+        true,
+        false,
+        false);
+
+    // TODO : Get file status and create a rollback stat and file
+    // TODO : Delete the .aux files along with the instant file, okay for now since the archival process will
+    // delete these files when it does not see a corresponding instant file under .hoodie
+    return rollbackActionExecutor.execute();
   }
 }
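
[Editor's note] Both restore executors above now share the same ordering: validate the action being rolled back, schedule a rollback (which writes a rollback.requested plan), reload the timeline, then execute the rollback with the same instant time. The sketch below illustrates only that ordering; the planner/executor interfaces are hypothetical stand-ins, not Hudi classes.

    import java.util.UUID;

    public class RestoreRollbackFlowSketch {
      // hypothetical stand-ins for table.scheduleRollback(...) and the rollback action executor
      interface RollbackPlanner {
        void scheduleRollback(String rollbackInstantTime, String instantToRollback);
      }

      interface RollbackExecutor {
        String execute(String rollbackInstantTime, String instantToRollback);
      }

      static String rollbackInstant(String instantToRollback, RollbackPlanner planner, RollbackExecutor executor) {
        // 1. pick a new instant time and schedule the rollback so a rollback.requested plan exists
        String rollbackInstantTime = UUID.randomUUID().toString(); // stand-in for a new timeline instant time
        planner.scheduleRollback(rollbackInstantTime, instantToRollback);
        // 2. (the active timeline would be reloaded here) execute against the scheduled plan, reusing the same instant time
        return executor.execute(rollbackInstantTime, instantToRollback);
      }

      public static void main(String[] args) {
        String result = rollbackInstant("20210901120000",
            (rollbackTime, target) -> System.out.println("scheduled rollback " + rollbackTime + " for " + target),
            (rollbackTime, target) -> "rolled back " + target + " at " + rollbackTime);
        System.out.println(result);
      }
    }
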
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/AbstractMarkerBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/AbstractMarkerBasedRollbackStrategy.java
deleted file mode 100644
index cc596ba3422b7..0000000000000
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/AbstractMarkerBasedRollbackStrategy.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.table.action.rollback;
-
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hudi.common.HoodieRollbackStat;
-import org.apache.hudi.common.engine.HoodieEngineContext;
-import org.apache.hudi.common.fs.FSUtils;
-import org.apache.hudi.common.model.HoodieLogFile;
-import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.table.log.HoodieLogFormat;
-import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
-import org.apache.hudi.common.table.log.block.HoodieLogBlock;
-import org.apache.hudi.common.table.timeline.HoodieInstant;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.table.HoodieTable;
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
-
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Map;
-import java.util.Objects;
-
-/**
- * Performs rollback using marker files generated during the write..
- */
-public abstract class AbstractMarkerBasedRollbackStrategy implements BaseRollbackActionExecutor.RollbackStrategy {
-
-  private static final Logger LOG = LogManager.getLogger(AbstractMarkerBasedRollbackStrategy.class);
-
-  protected final HoodieTable table;
-
-  protected final transient HoodieEngineContext context;
-
-  protected final HoodieWriteConfig config;
-
-  protected final String basePath;
-
-  protected final String instantTime;
-
-  public AbstractMarkerBasedRollbackStrategy(HoodieTable table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) {
-    this.table = table;
-    this.context = context;
-    this.basePath = table.getMetaClient().getBasePath();
-    this.config = config;
-    this.instantTime = instantTime;
-  }
-
-  protected HoodieRollbackStat undoMerge(String mergedBaseFilePath) throws IOException {
-    LOG.info("Rolling back by deleting the merged base file:" + mergedBaseFilePath);
-    return deleteBaseFile(mergedBaseFilePath);
-  }
-
-  protected HoodieRollbackStat undoCreate(String createdBaseFilePath) throws IOException {
-    LOG.info("Rolling back by deleting the created base file:" + createdBaseFilePath);
-    return deleteBaseFile(createdBaseFilePath);
-  }
-
-  private HoodieRollbackStat deleteBaseFile(String baseFilePath) throws IOException {
-    Path fullDeletePath = new Path(basePath, baseFilePath);
-    String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), fullDeletePath.getParent());
-    boolean isDeleted = table.getMetaClient().getFs().delete(fullDeletePath);
-    return HoodieRollbackStat.newBuilder()
-        .withPartitionPath(partitionPath)
-        .withDeletedFileResult(baseFilePath, isDeleted)
-        .build();
-  }
-
-  protected HoodieRollbackStat undoAppend(String appendBaseFilePath, HoodieInstant instantToRollback) throws IOException, InterruptedException {
-    Path baseFilePathForAppend = new Path(basePath, appendBaseFilePath);
-    String fileId = FSUtils.getFileIdFromFilePath(baseFilePathForAppend);
-    String baseCommitTime = FSUtils.getCommitTime(baseFilePathForAppend.getName());
-    String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), new Path(basePath, appendBaseFilePath).getParent());
-    final Map writtenLogFileSizeMap = getWrittenLogFileSizeMap(partitionPath, baseCommitTime, fileId);
-
-    HoodieLogFormat.Writer writer = null;
-    try {
-      Path partitionFullPath = FSUtils.getPartitionPath(basePath, partitionPath);
-
-      if (!table.getMetaClient().getFs().exists(partitionFullPath)) {
-        return HoodieRollbackStat.newBuilder()
-            .withPartitionPath(partitionPath)
-            .build();
-      }
-      writer = HoodieLogFormat.newWriterBuilder()
-          .onParentPath(partitionFullPath)
-          .withFileId(fileId)
-          .overBaseCommit(baseCommitTime)
-          .withFs(table.getMetaClient().getFs())
-          .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
-
-      // generate metadata
-      Map header = RollbackUtils.generateHeader(instantToRollback.getTimestamp(), instantTime);
-      // if update belongs to an existing log file
-      writer.appendBlock(new HoodieCommandBlock(header));
-    } finally {
-      try {
-        if (writer != null) {
-          writer.close();
-        }
-      } catch (IOException io) {
-        throw new HoodieIOException("Error closing append of rollback block..", io);
-      }
-    }
-
-    // the information of files appended to is required for metadata sync
-    Map filesToNumBlocksRollback = Collections.singletonMap(
-          table.getMetaClient().getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()),
-          1L);
-
-    return HoodieRollbackStat.newBuilder()
-        .withPartitionPath(partitionPath)
-        .withRollbackBlockAppendResults(filesToNumBlocksRollback)
-        .withWrittenLogFileSizeMap(writtenLogFileSizeMap).build();
-  }
-
-  /**
-   * Returns written log file size map for the respective baseCommitTime to assist in metadata table syncing.
-   * @param partitionPath partition path of interest
-   * @param baseCommitTime base commit time of interest
-   * @param fileId fileId of interest
-   * @return Map
-   * @throws IOException
-   */
-  protected Map getWrittenLogFileSizeMap(String partitionPath, String baseCommitTime, String fileId) throws IOException {
-    return Collections.EMPTY_MAP;
-  }
-}
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java
index 7dbbaa70ee9d4..ff50a2961eafb 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java
@@ -19,7 +19,9 @@
 package org.apache.hudi.table.action.rollback;
 
 import org.apache.hudi.avro.model.HoodieRollbackMetadata;
+import org.apache.hudi.avro.model.HoodieRollbackPlan;
 import org.apache.hudi.client.heartbeat.HoodieHeartbeatClient;
+import org.apache.hudi.client.transaction.TransactionManager;
 import org.apache.hudi.common.HoodieRollbackStat;
 import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
 import org.apache.hudi.common.engine.HoodieEngineContext;
@@ -43,7 +45,6 @@
 import org.apache.log4j.Logger;
 
 import java.io.IOException;
-import java.io.Serializable;
 import java.util.Collections;
 import java.util.List;
 import java.util.Objects;
@@ -53,24 +54,22 @@ public abstract class BaseRollbackActionExecutor execute(HoodieInstant instantToRollback);
-  }
-
   protected final HoodieInstant instantToRollback;
   protected final boolean deleteInstants;
   protected final boolean skipTimelinePublish;
   protected final boolean useMarkerBasedStrategy;
+  private final TransactionManager txnManager;
+  private final boolean skipLocking;
 
   public BaseRollbackActionExecutor(HoodieEngineContext context,
       HoodieWriteConfig config,
       HoodieTable table,
       String instantTime,
       HoodieInstant instantToRollback,
-      boolean deleteInstants) {
+      boolean deleteInstants,
+      boolean skipLocking) {
     this(context, config, table, instantTime, instantToRollback, deleteInstants,
-        false, config.shouldRollbackUsingMarkers());
+        false, config.shouldRollbackUsingMarkers(), skipLocking);
   }
 
   public BaseRollbackActionExecutor(HoodieEngineContext context,
@@ -80,7 +79,8 @@ public BaseRollbackActionExecutor(HoodieEngineContext context,
       HoodieInstant instantToRollback,
       boolean deleteInstants,
       boolean skipTimelinePublish,
-      boolean useMarkerBasedStrategy) {
+      boolean useMarkerBasedStrategy,
+      boolean skipLocking) {
     super(context, config, table, instantTime);
     this.instantToRollback = instantToRollback;
     this.deleteInstants = deleteInstants;
@@ -90,32 +90,78 @@ public BaseRollbackActionExecutor(HoodieEngineContext context,
       ValidationUtils.checkArgument(!instantToRollback.isCompleted(),
           "Cannot use marker based rollback strategy on completed instant:" + instantToRollback);
     }
+    this.skipLocking = skipLocking;
+    this.txnManager = new TransactionManager(config, table.getMetaClient().getFs());
   }
 
-  protected abstract RollbackStrategy getRollbackStrategy();
+  /**
+   * Execute actual rollback and fetch list of RollbackStats.
+   * @param hoodieRollbackPlan instance of {@link HoodieRollbackPlan} that needs to be executed.
+   * @return a list of {@link HoodieRollbackStat}s.
+   * @throws IOException if executing the rollback plan fails with an I/O error.
+   */
+  protected abstract List executeRollback(HoodieRollbackPlan hoodieRollbackPlan) throws IOException;
 
-  protected abstract List executeRollback() throws IOException;
+  private HoodieRollbackMetadata runRollback(HoodieTable table, HoodieInstant rollbackInstant, HoodieRollbackPlan rollbackPlan) {
+    ValidationUtils.checkArgument(rollbackInstant.getState().equals(HoodieInstant.State.REQUESTED)
+        || rollbackInstant.getState().equals(HoodieInstant.State.INFLIGHT));
+    try {
+      final HoodieInstant inflightInstant;
+      final HoodieTimer timer = new HoodieTimer();
+      timer.startTimer();
+      if (rollbackInstant.isRequested()) {
+        inflightInstant = table.getActiveTimeline().transitionRollbackRequestedToInflight(rollbackInstant,
+            TimelineMetadataUtils.serializeRollbackPlan(rollbackPlan));
+      } else {
+        inflightInstant = rollbackInstant;
+      }
 
-  protected abstract List executeRollbackUsingFileListing(HoodieInstant instantToRollback);
+      HoodieTimer rollbackTimer = new HoodieTimer().startTimer();
+      List stats = doRollbackAndGetStats(rollbackPlan);
+      HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.convertRollbackMetadata(
+          instantTime,
+          Option.of(rollbackTimer.endTimer()),
+          Collections.singletonList(instantToRollback),
+          stats);
+      if (!skipTimelinePublish) {
+        finishRollback(inflightInstant, rollbackMetadata);
+      }
+
+      // Finally, remove the markers post rollback.
+      WriteMarkersFactory.get(config.getMarkersType(), table, instantToRollback.getTimestamp())
+          .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
+
+      return rollbackMetadata;
+    } catch (IOException e) {
+      throw new HoodieIOException("Failed to rollback commit ", e);
+    }
+  }
 
   @Override
   public HoodieRollbackMetadata execute() {
-    HoodieTimer rollbackTimer = new HoodieTimer().startTimer();
-    List stats = doRollbackAndGetStats();
-    HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.convertRollbackMetadata(
-        instantTime,
-        Option.of(rollbackTimer.endTimer()),
-        Collections.singletonList(instantToRollback),
-        stats);
-    if (!skipTimelinePublish) {
-      finishRollback(rollbackMetadata);
+    table.getMetaClient().reloadActiveTimeline();
+    List rollBackInstants = table.getRollbackTimeline()
+        .filterInflightsAndRequested().getInstants().collect(Collectors.toList());
+    if (rollBackInstants.isEmpty()) {
+      throw new HoodieRollbackException("No Requested Rollback Instants found to execute rollback ");
+    }
+    HoodieInstant rollbackInstant = null;
+    for (HoodieInstant instant : rollBackInstants) {
+      if (instantTime.equals(instant.getTimestamp())) {
+        rollbackInstant = instant;
+        break;
+      }
+    }
+    if (rollbackInstant != null) {
+      try {
+        HoodieRollbackPlan rollbackPlan = RollbackUtils.getRollbackPlan(table.getMetaClient(), rollbackInstant);
+        return runRollback(table, rollbackInstant, rollbackPlan);
+      } catch (IOException e) {
+        throw new HoodieIOException("Failed to fetch rollback plan to rollback commit " + rollbackInstant.getTimestamp(), e);
+      }
+    } else {
+      throw new HoodieIOException("No inflight rollback instants found for commit time " + instantTime);
     }
-
-    // Finally, remove the markers post rollback.
-    WriteMarkersFactory.get(config.getMarkersType(), table, instantToRollback.getTimestamp())
-        .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
-
-    return rollbackMetadata;
   }
 
   private void validateSavepointRollbacks() {
@@ -173,7 +219,7 @@ private void rollBackIndex() {
     LOG.info("Index rolled back for commits " + instantToRollback);
   }
 
-  public List doRollbackAndGetStats() {
+  public List doRollbackAndGetStats(HoodieRollbackPlan hoodieRollbackPlan) {
     final String instantTimeToRollback = instantToRollback.getTimestamp();
     final boolean isPendingCompaction = Objects.equals(HoodieTimeline.COMPACTION_ACTION, instantToRollback.getAction())
         && !instantToRollback.isCompleted();
@@ -186,7 +232,7 @@ public List doRollbackAndGetStats() {
     }
 
     try {
-      List stats = executeRollback();
+      List stats = executeRollback(hoodieRollbackPlan);
       LOG.info("Rolled back inflight instant " + instantTimeToRollback);
       if (!isPendingCompaction) {
         rollBackIndex();
@@ -197,12 +243,20 @@ public List doRollbackAndGetStats() {
     }
   }
 
-  protected void finishRollback(HoodieRollbackMetadata rollbackMetadata) throws HoodieIOException {
+  /**
+   * Execute rollback and fetch rollback stats.
+   * @param instantToRollback instant to be rolled back.
+   * @param rollbackPlan instance of {@link HoodieRollbackPlan} for which rollback needs to be executed.
+   * @return list of {@link HoodieRollbackStat}s.
+   */
+  protected List executeRollback(HoodieInstant instantToRollback, HoodieRollbackPlan rollbackPlan) {
+    return new BaseRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackPlan.getRollbackRequests());
+  }
+
+  protected void finishRollback(HoodieInstant inflightInstant, HoodieRollbackMetadata rollbackMetadata) throws HoodieIOException {
     try {
-      table.getActiveTimeline().createNewInstant(
-          new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.ROLLBACK_ACTION, instantTime));
-      table.getActiveTimeline().saveAsComplete(
-          new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, instantTime),
+      writeToMetadata(rollbackMetadata);
+      table.getActiveTimeline().transitionRollbackInflightToComplete(inflightInstant,
           TimelineMetadataUtils.serializeRollbackMetadata(rollbackMetadata));
       LOG.info("Rollback of Commits " + rollbackMetadata.getCommitsRollback() + " is complete");
     } catch (IOException e) {
@@ -210,6 +264,25 @@ protected void finishRollback(HoodieRollbackMetadata rollbackMetadata) throws Ho
     }
   }
 
+  /**
+   * Update the metadata table if enabled. Any update to the metadata table happens within the data table lock.
+   * @param rollbackMetadata instance of {@link HoodieRollbackMetadata} to be applied to metadata.
+   */
+  private void writeToMetadata(HoodieRollbackMetadata rollbackMetadata) {
+    if (config.isMetadataTableEnabled()) {
+      try {
+        if (!skipLocking) {
+          this.txnManager.beginTransaction(Option.empty(), Option.empty());
+        }
+        writeTableMetadata(rollbackMetadata);
+      } finally {
+        if (!skipLocking) {
+          this.txnManager.endTransaction();
+        }
+      }
+    }
+  }
+
   /**
    * Delete Inflight instant if enabled.
    *
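
[Editor's note] The writeToMetadata change above guards metadata-table updates with a transaction unless skipLocking is set. Below is a minimal sketch of that guard pattern, using a plain ReentrantLock as a stand-in for Hudi's TransactionManager (an assumption for illustration only).

    import java.util.concurrent.locks.ReentrantLock;

    public class LockGuardedWriteSketch {
      private final ReentrantLock txnLock = new ReentrantLock(); // stand-in for a table-level transaction/lock
      private final boolean skipLocking;

      public LockGuardedWriteSketch(boolean skipLocking) {
        this.skipLocking = skipLocking;
      }

      // mirrors the shape of writeToMetadata: begin the transaction unless skipLocking, always end it in finally
      public void writeToMetadata(Runnable metadataUpdate) {
        if (!skipLocking) {
          txnLock.lock();
        }
        try {
          metadataUpdate.run();
        } finally {
          if (!skipLocking) {
            txnLock.unlock();
          }
        }
      }

      public static void main(String[] args) {
        new LockGuardedWriteSketch(false)
            .writeToMetadata(() -> System.out.println("metadata table updated under the data table lock"));
      }
    }
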
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackHelper.java
new file mode 100644
index 0000000000000..078d9ac27d389
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackHelper.java
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.rollback;
+
+import org.apache.hudi.avro.model.HoodieRollbackRequest;
+import org.apache.hudi.common.HoodieRollbackStat;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.function.SerializableFunction;
+import org.apache.hudi.common.model.HoodieLogFile;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.log.HoodieLogFormat;
+import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
+import org.apache.hudi.common.table.log.block.HoodieLogBlock;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.exception.HoodieRollbackException;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Contains common methods to be used across engines for rollback operation.
+ */
+public class BaseRollbackHelper implements Serializable {
+
+  private static final Logger LOG = LogManager.getLogger(BaseRollbackHelper.class);
+  protected static final String EMPTY_STRING = "";
+
+  protected final HoodieTableMetaClient metaClient;
+  protected final HoodieWriteConfig config;
+
+  public BaseRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
+    this.metaClient = metaClient;
+    this.config = config;
+  }
+
+  /**
+   * Performs, in parallel, all the rollback actions that have been collected.
+   */
+  public List performRollback(HoodieEngineContext context, HoodieInstant instantToRollback,
+                                                  List rollbackRequests) {
+    int parallelism = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1);
+    context.setJobStatus(this.getClass().getSimpleName(), "Perform rollback actions");
+    // Without converting to SerializableHoodieRollbackRequest, this code fails: using the Avro model (HoodieRollbackRequest)
+    // directly within spark.parallelize fails with com.esotericsoftware.kryo.KryoException
+    // stack trace: https://gist.github.com/nsivabalan/b6359e7d5038484f8043506c8bc9e1c8
+    // related issue: https://issues.apache.org/jira/browse/SPARK-3601. Avro deserializes the list as GenericData.Array.
+    List serializableRequests = rollbackRequests.stream().map(SerializableHoodieRollbackRequest::new).collect(Collectors.toList());
+    return context.reduceByKey(maybeDeleteAndCollectStats(context, instantToRollback, serializableRequests, true, parallelism),
+        RollbackUtils::mergeRollbackStat, parallelism);
+  }
+
+  /**
+   * Collect all file info that needs to be rolled back.
+   */
+  public List collectRollbackStats(HoodieEngineContext context, HoodieInstant instantToRollback,
+                                                       List rollbackRequests) {
+    int parallelism = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1);
+    context.setJobStatus(this.getClass().getSimpleName(), "Collect rollback stats for upgrade/downgrade");
+    // Without converting to SerializableHoodieRollbackRequest, this code fails: using the Avro model (HoodieRollbackRequest)
+    // directly within spark.parallelize fails with com.esotericsoftware.kryo.KryoException
+    // stack trace: https://gist.github.com/nsivabalan/b6359e7d5038484f8043506c8bc9e1c8
+    // related issue: https://issues.apache.org/jira/browse/SPARK-3601. Avro deserializes the list as GenericData.Array.
+    List serializableRequests = rollbackRequests.stream().map(SerializableHoodieRollbackRequest::new).collect(Collectors.toList());
+    return context.reduceByKey(maybeDeleteAndCollectStats(context, instantToRollback, serializableRequests, false, parallelism),
+        RollbackUtils::mergeRollbackStat, parallelism);
+  }
+
+  /**
+   * Maybe delete the files of interest and collect stats, or collect stats only without deleting.
+   *
+   * @param context           instance of {@link HoodieEngineContext} to use.
+   * @param instantToRollback {@link HoodieInstant} of interest for which deletion or collect stats is requested.
+   * @param rollbackRequests  List of {@link SerializableHoodieRollbackRequest} to be operated on.
+   * @param doDelete          {@code true} if deletion has to be done. {@code false} if only stats are to be collected w/o performing any deletes.
+   * @return stats collected with or w/o actual deletions.
+   */
+  List> maybeDeleteAndCollectStats(HoodieEngineContext context,
+                                                                    HoodieInstant instantToRollback,
+                                                                    List rollbackRequests,
+                                                                    boolean doDelete, int numPartitions) {
+    return context.flatMap(rollbackRequests, (SerializableFunction>>) rollbackRequest -> {
+      List filesToBeDeleted = rollbackRequest.getFilesToBeDeleted();
+      if (!filesToBeDeleted.isEmpty()) {
+        List rollbackStats = deleteFiles(metaClient, filesToBeDeleted, doDelete);
+        List> partitionToRollbackStats = new ArrayList<>();
+        rollbackStats.forEach(entry -> partitionToRollbackStats.add(Pair.of(entry.getPartitionPath(), entry)));
+        return partitionToRollbackStats.stream();
+      } else if (!rollbackRequest.getLogBlocksToBeDeleted().isEmpty()) {
+        Map logFilesToBeDeleted = rollbackRequest.getLogBlocksToBeDeleted();
+        String fileId = rollbackRequest.getFileId();
+        String latestBaseInstant = rollbackRequest.getLatestBaseInstant();
+        FileSystem fs = metaClient.getFs();
+        // collect all log files that are supposed to be deleted with this rollback
+        // Note: fs.getFileStatus below could fail if a file has already been removed; since log files themselves
+        // are not deleted by the rollback, that case is currently not handled.
+        Map writtenLogFileSizeMap = new HashMap<>();
+        for (Map.Entry entry : logFilesToBeDeleted.entrySet()) {
+          writtenLogFileSizeMap.put(fs.getFileStatus(new Path(entry.getKey())), entry.getValue());
+        }
+        HoodieLogFormat.Writer writer = null;
+        try {
+          writer = HoodieLogFormat.newWriterBuilder()
+              .onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath()))
+              .withFileId(fileId)
+              .overBaseCommit(latestBaseInstant)
+              .withFs(metaClient.getFs())
+              .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
+
+          // generate metadata
+          if (doDelete) {
+            Map header = generateHeader(instantToRollback.getTimestamp());
+            // if update belongs to an existing log file
+            writer.appendBlock(new HoodieCommandBlock(header));
+          }
+        } catch (IOException | InterruptedException io) {
+          throw new HoodieRollbackException("Failed to rollback for instant " + instantToRollback, io);
+        } finally {
+          try {
+            if (writer != null) {
+              writer.close();
+            }
+          } catch (IOException io) {
+            throw new HoodieIOException("Error appending rollback block..", io);
+          }
+        }
+
+        // This step is intentionally done after the writer is closed. It guarantees that getFileStatus
+        // reflects correct stats and that FileNotFoundException is not thrown on
+        // cloud storage: HUDI-168
+        Map filesToNumBlocksRollback = Collections.singletonMap(
+            metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()),
+            1L
+        );
+        return Collections.singletonList(Pair.of(rollbackRequest.getPartitionPath(),
+            HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath())
+                .withRollbackBlockAppendResults(filesToNumBlocksRollback)
+                .withWrittenLogFileSizeMap(writtenLogFileSizeMap).build())).stream();
+      } else {
+        return Collections
+            .singletonList(Pair.of(rollbackRequest.getPartitionPath(),
+                HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath())
+                    .build())).stream();
+      }
+    }, numPartitions);
+  }
+
+  /**
+   * Common method used for cleaning out files during rollback.
+   */
+  protected List deleteFiles(HoodieTableMetaClient metaClient, List filesToBeDeleted, boolean doDelete) throws IOException {
+    return filesToBeDeleted.stream().map(fileToDelete -> {
+      String basePath = metaClient.getBasePath();
+      try {
+        Path fullDeletePath = new Path(fileToDelete);
+        String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), fullDeletePath.getParent());
+        boolean isDeleted = true;
+        if (doDelete) {
+          try {
+            isDeleted = metaClient.getFs().delete(fullDeletePath);
+          } catch (FileNotFoundException e) {
+            // if a first rollback attempt failed and was retried, some files may already have been deleted.
+            isDeleted = true;
+          }
+        }
+        return HoodieRollbackStat.newBuilder()
+            .withPartitionPath(partitionPath)
+            .withDeletedFileResult(fullDeletePath.toString(), isDeleted)
+            .build();
+      } catch (IOException e) {
+        LOG.error("Fetching file status for ");
+        throw new HoodieIOException("Fetching file status for " + fileToDelete + " failed ", e);
+      }
+    }).collect(Collectors.toList());
+  }
+
+  protected Map generateHeader(String commit) {
+    // generate metadata
+    Map header = new HashMap<>(3);
+    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
+    header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, commit);
+    header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
+        String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
+    return header;
+  }
+
+  public interface SerializablePathFilter extends PathFilter, Serializable {
+
+  }
+}
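
[Editor's note] performRollback and collectRollbackStats above both end by reducing per-file stats by partition path via context.reduceByKey. The sketch below expresses that reduction with plain Java streams; the Stat class and merge logic are simplified stand-ins for HoodieRollbackStat and the merge function passed to reduceByKey, not the Hudi types.

    import java.util.AbstractMap.SimpleEntry;
    import java.util.Map;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    public class ReduceByPartitionSketch {
      static class Stat {
        final String partitionPath;
        final int deletedFiles;

        Stat(String partitionPath, int deletedFiles) {
          this.partitionPath = partitionPath;
          this.deletedFiles = deletedFiles;
        }

        Stat merge(Stat other) {
          return new Stat(partitionPath, deletedFiles + other.deletedFiles);
        }

        @Override
        public String toString() {
          return partitionPath + " -> " + deletedFiles + " deleted file(s)";
        }
      }

      public static void main(String[] args) {
        // per-file stats keyed by partition path, as produced by maybeDeleteAndCollectStats
        Stream<Map.Entry<String, Stat>> perFileStats = Stream.of(
            new SimpleEntry<>("2021/09/01", new Stat("2021/09/01", 1)),
            new SimpleEntry<>("2021/09/01", new Stat("2021/09/01", 2)),
            new SimpleEntry<>("2021/09/02", new Stat("2021/09/02", 1)));

        // one merged stat per partition, mirroring context.reduceByKey(stats, mergeFunction, parallelism)
        Map<String, Stat> mergedByPartition = perFileStats.collect(
            Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, Stat::merge));
        mergedByPartition.values().forEach(System.out::println);
      }
    }
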
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackPlanActionExecutor.java
new file mode 100644
index 0000000000000..24edde27642cc
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackPlanActionExecutor.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.rollback;
+
+import org.apache.hudi.avro.model.HoodieInstantInfo;
+import org.apache.hudi.avro.model.HoodieRollbackPlan;
+import org.apache.hudi.avro.model.HoodieRollbackRequest;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.table.HoodieTable;
+import org.apache.hudi.table.action.BaseActionExecutor;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Base rollback plan action executor to assist in scheduling rollback requests. This phase serializes the {@link HoodieRollbackPlan}
+ * to the rollback.requested instant.
+ */
+public class BaseRollbackPlanActionExecutor extends BaseActionExecutor> {
+
+  private static final Logger LOG = LogManager.getLogger(BaseRollbackPlanActionExecutor.class);
+
+  protected final HoodieInstant instantToRollback;
+  private final boolean skipTimelinePublish;
+
+  public static final Integer ROLLBACK_PLAN_VERSION_1 = 1;
+  public static final Integer LATEST_ROLLBACK_PLAN_VERSION = ROLLBACK_PLAN_VERSION_1;
+
+  public BaseRollbackPlanActionExecutor(HoodieEngineContext context,
+                                        HoodieWriteConfig config,
+                                        HoodieTable table,
+                                        String instantTime,
+                                        HoodieInstant instantToRollback,
+                                        boolean skipTimelinePublish) {
+    super(context, config, table, instantTime);
+    this.instantToRollback = instantToRollback;
+    this.skipTimelinePublish = skipTimelinePublish;
+  }
+
+  /**
+   * Interface for RollbackStrategy. Two types are supported: listing based and marker based.
+   */
+  interface RollbackStrategy extends Serializable {
+
+    /**
+     * Fetch list of {@link HoodieRollbackRequest}s to be added to rollback plan.
+     * @param instantToRollback instant to be rolled back.
+     * @return list of {@link HoodieRollbackRequest}s to be added to rollback plan
+     */
+    List getRollbackRequests(HoodieInstant instantToRollback);
+  }
+
+  /**
+   * Fetch the rollback strategy to use.
+   *
+   * @return the configured {@link RollbackStrategy}, marker based or listing based.
+   */
+  private BaseRollbackPlanActionExecutor.RollbackStrategy getRollbackStrategy() {
+    if (config.shouldRollbackUsingMarkers()) {
+      return new MarkerBasedRollbackStrategy(table, context, config, instantTime);
+    } else {
+      return new ListingBasedRollbackStrategy(table, context, config, instantTime);
+    }
+  }
+
+  /**
+   * Creates a rollback plan if there are files to be rolled back and stores it in the requested instant file.
+   * Rollback Plan contains absolute file paths.
+   *
+   * @param startRollbackTime Rollback Instant Time
+   * @return Rollback Plan if generated
+   */
+  protected Option requestRollback(String startRollbackTime) {
+    final HoodieInstant rollbackInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime);
+    try {
+      List rollbackRequests = new ArrayList<>();
+      if (!instantToRollback.isRequested()) {
+        rollbackRequests.addAll(getRollbackStrategy().getRollbackRequests(instantToRollback));
+      }
+      HoodieRollbackPlan rollbackPlan = new HoodieRollbackPlan(new HoodieInstantInfo(instantToRollback.getTimestamp(),
+          instantToRollback.getAction()), rollbackRequests, LATEST_ROLLBACK_PLAN_VERSION);
+      if (!skipTimelinePublish) {
+        if (table.getRollbackTimeline().filterInflightsAndRequested().containsInstant(rollbackInstant.getTimestamp())) {
+          LOG.warn("Request Rollback found with instant time " + rollbackInstant + ", hence skipping scheduling rollback");
+        } else {
+          table.getActiveTimeline().saveToRollbackRequested(rollbackInstant, TimelineMetadataUtils.serializeRollbackPlan(rollbackPlan));
+          table.getMetaClient().reloadActiveTimeline();
+          LOG.info("Requesting Rollback with instant time " + rollbackInstant);
+        }
+      }
+      return Option.of(rollbackPlan);
+    } catch (IOException e) {
+      LOG.error("Got exception when saving rollback requested file", e);
+      throw new HoodieIOException(e.getMessage(), e);
+    }
+  }
+
+  @Override
+  public Option execute() {
+    // Plan a new rollback action
+    return requestRollback(instantTime);
+  }
+}
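
[Editor's note] Taken together with BaseRollbackActionExecutor above, rollback is now two-phase: the plan executor serializes a plan to a rollback.requested instant, and the rollback executor later picks it up, transitions it to inflight, and completes it. The sketch below is a minimal state-machine illustration of that lifecycle only; it is not Hudi's timeline API.

    public class RollbackLifecycleSketch {
      enum State { REQUESTED, INFLIGHT, COMPLETED }

      // phase 1: the plan executor serializes the rollback plan and leaves a REQUESTED instant behind
      static State schedule() {
        return State.REQUESTED;
      }

      // phase 2: the rollback executor only runs against a REQUESTED or INFLIGHT rollback instant,
      // then transitions it to COMPLETED once the rollback metadata is written
      static State execute(State current) {
        if (current != State.REQUESTED && current != State.INFLIGHT) {
          throw new IllegalStateException("Rollback can only execute from REQUESTED/INFLIGHT, got " + current);
        }
        return State.COMPLETED;
      }

      public static void main(String[] args) {
        System.out.println(execute(schedule())); // COMPLETED
      }
    }
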
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java
similarity index 62%
rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseCopyOnWriteRollbackActionExecutor.java
rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java
index fa74f7f6e86bc..5e11354303f26 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseCopyOnWriteRollbackActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java
@@ -18,6 +18,7 @@
 
 package org.apache.hudi.table.action.rollback;
 
+import org.apache.hudi.avro.model.HoodieRollbackPlan;
 import org.apache.hudi.common.HoodieRollbackStat;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -33,32 +34,34 @@
 import java.util.ArrayList;
 import java.util.List;
 
-public abstract class BaseCopyOnWriteRollbackActionExecutor extends BaseRollbackActionExecutor {
+public class CopyOnWriteRollbackActionExecutor extends BaseRollbackActionExecutor {
 
-  private static final Logger LOG = LogManager.getLogger(BaseCopyOnWriteRollbackActionExecutor.class);
+  private static final Logger LOG = LogManager.getLogger(CopyOnWriteRollbackActionExecutor.class);
 
-  public BaseCopyOnWriteRollbackActionExecutor(HoodieEngineContext context,
-                                               HoodieWriteConfig config,
-                                               HoodieTable table,
-                                               String instantTime,
-                                               HoodieInstant commitInstant,
-                                               boolean deleteInstants) {
-    super(context, config, table, instantTime, commitInstant, deleteInstants);
+  public CopyOnWriteRollbackActionExecutor(HoodieEngineContext context,
+                                           HoodieWriteConfig config,
+                                           HoodieTable table,
+                                           String instantTime,
+                                           HoodieInstant commitInstant,
+                                           boolean deleteInstants,
+                                           boolean skipLocking) {
+    super(context, config, table, instantTime, commitInstant, deleteInstants, skipLocking);
   }
 
-  public BaseCopyOnWriteRollbackActionExecutor(HoodieEngineContext context,
-                                               HoodieWriteConfig config,
-                                               HoodieTable table,
-                                               String instantTime,
-                                               HoodieInstant commitInstant,
-                                               boolean deleteInstants,
-                                               boolean skipTimelinePublish,
-                                               boolean useMarkerBasedStrategy) {
-    super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy);
+  public CopyOnWriteRollbackActionExecutor(HoodieEngineContext context,
+                                           HoodieWriteConfig config,
+                                           HoodieTable table,
+                                           String instantTime,
+                                           HoodieInstant commitInstant,
+                                           boolean deleteInstants,
+                                           boolean skipTimelinePublish,
+                                           boolean useMarkerBasedStrategy,
+                                           boolean skipLocking) {
+    super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy, skipLocking);
   }
 
   @Override
-  protected List executeRollback() {
+  protected List executeRollback(HoodieRollbackPlan hoodieRollbackPlan) {
     HoodieTimer rollbackTimer = new HoodieTimer();
     rollbackTimer.startTimer();
 
@@ -78,7 +81,7 @@ protected List executeRollback() {
     if (!resolvedInstant.isRequested()) {
       // delete all the data files for this commit
       LOG.info("Clean out all base files generated for commit: " + resolvedInstant);
-      stats = getRollbackStrategy().execute(resolvedInstant);
+      stats = executeRollback(resolvedInstant, hoodieRollbackPlan);
     }
 
     dropBootstrapIndexIfNeeded(instantToRollback);
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java
new file mode 100644
index 0000000000000..b47136fa02a58
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.table.action.rollback;
+
+import org.apache.hudi.avro.model.HoodieRollbackRequest;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieFileFormat;
+import org.apache.hudi.common.model.HoodieLogFile;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.config.HoodieWriteConfig;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import static org.apache.hudi.table.action.rollback.BaseRollbackHelper.EMPTY_STRING;
+
+/**
+ * Helper that collects listing-based {@link HoodieRollbackRequest}s for the rollback plan of Hoodie tables.
+ */
+public class ListingBasedRollbackHelper implements Serializable {
+  private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackHelper.class);
+
+  private final HoodieTableMetaClient metaClient;
+  private final HoodieWriteConfig config;
+
+  public ListingBasedRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
+    this.metaClient = metaClient;
+    this.config = config;
+  }
+
+  /**
+   * Collects the rollback requests needed to build the rollback plan.
+   */
+  public List getRollbackRequestsForRollbackPlan(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) {
+    int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1);
+    context.setJobStatus(this.getClass().getSimpleName(), "Creating Rollback Plan");
+    return getListingBasedRollbackRequests(context, instantToRollback, rollbackRequests, sparkPartitions);
+  }
+
+  /**
+   * Converts listing-based rollback requests into {@link HoodieRollbackRequest}s for the rollback plan.
+   *
+   * @param context           instance of {@link HoodieEngineContext} to use.
+   * @param instantToRollback {@link HoodieInstant} of interest that is being rolled back.
+   * @param rollbackRequests  List of {@link ListingBasedRollbackRequest} to be operated on.
+   * @param numPartitions     number of partitions to use for parallelism.
+   * @return the list of {@link HoodieRollbackRequest}s to include in the rollback plan.
+   */
+  private List getListingBasedRollbackRequests(HoodieEngineContext context, HoodieInstant instantToRollback,
+                                                                      List rollbackRequests, int numPartitions) {
+    return context.map(rollbackRequests, rollbackRequest -> {
+      switch (rollbackRequest.getType()) {
+        case DELETE_DATA_FILES_ONLY: {
+          final FileStatus[] filesToDeletedStatus = getBaseFilesToBeDeleted(metaClient, config, instantToRollback.getTimestamp(),
+              rollbackRequest.getPartitionPath(), metaClient.getFs());
+          List filesToBeDeleted = Arrays.stream(filesToDeletedStatus).map(fileStatus -> {
+            String fileToBeDeleted = fileStatus.getPath().toString();
+            // strip scheme
+            return fileToBeDeleted.substring(fileToBeDeleted.indexOf(":") + 1);
+          }).collect(Collectors.toList());
+          return new HoodieRollbackRequest(rollbackRequest.getPartitionPath(),
+              EMPTY_STRING, EMPTY_STRING, filesToBeDeleted, Collections.EMPTY_MAP);
+        }
+        case DELETE_DATA_AND_LOG_FILES: {
+          final FileStatus[] filesToDeletedStatus = getBaseAndLogFilesToBeDeleted(instantToRollback.getTimestamp(), rollbackRequest.getPartitionPath(), metaClient.getFs());
+          List filesToBeDeleted = Arrays.stream(filesToDeletedStatus).map(fileStatus -> {
+            String fileToBeDeleted = fileStatus.getPath().toString();
+            // strip scheme
+            return fileToBeDeleted.substring(fileToBeDeleted.indexOf(":") + 1);
+          }).collect(Collectors.toList());
+          return new HoodieRollbackRequest(rollbackRequest.getPartitionPath(), EMPTY_STRING, EMPTY_STRING, filesToBeDeleted, Collections.EMPTY_MAP);
+        }
+        case APPEND_ROLLBACK_BLOCK: {
+          String fileId = rollbackRequest.getFileId().get();
+          String latestBaseInstant = rollbackRequest.getLatestBaseInstant().get();
+          // collect all log files that are supposed to be deleted with this rollback
+          Map writtenLogFileSizeMap = FSUtils.getAllLogFiles(metaClient.getFs(),
+              FSUtils.getPartitionPath(config.getBasePath(), rollbackRequest.getPartitionPath()),
+              fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), latestBaseInstant)
+              .collect(Collectors.toMap(HoodieLogFile::getFileStatus, value -> value.getFileStatus().getLen()));
+          Map logFilesToBeDeleted = new HashMap<>();
+          for (Map.Entry fileToBeDeleted : writtenLogFileSizeMap.entrySet()) {
+            logFilesToBeDeleted.put(fileToBeDeleted.getKey().getPath().toString(), fileToBeDeleted.getValue());
+          }
+          return new HoodieRollbackRequest(rollbackRequest.getPartitionPath(), fileId, latestBaseInstant,
+              Collections.EMPTY_LIST, logFilesToBeDeleted);
+        }
+        default:
+          throw new IllegalStateException("Unknown Rollback action " + rollbackRequest);
+      }
+    }, numPartitions).stream().collect(Collectors.toList());
+  }
+
+  private FileStatus[] getBaseFilesToBeDeleted(HoodieTableMetaClient metaClient, HoodieWriteConfig config,
+                                               String commit, String partitionPath, FileSystem fs) throws IOException {
+    LOG.info("Collecting files to be cleaned/rolledback up for path " + partitionPath + " and commit " + commit);
+    String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
+    PathFilter filter = (path) -> {
+      if (path.toString().contains(basefileExtension)) {
+        String fileCommitTime = FSUtils.getCommitTime(path.getName());
+        return commit.equals(fileCommitTime);
+      }
+      return false;
+    };
+    return fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
+  }
+
+  private FileStatus[] getBaseAndLogFilesToBeDeleted(String commit, String partitionPath, FileSystem fs) throws IOException {
+    String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
+    BaseRollbackHelper.SerializablePathFilter filter = (path) -> {
+      if (path.toString().endsWith(basefileExtension)) {
+        String fileCommitTime = FSUtils.getCommitTime(path.getName());
+        return commit.equals(fileCommitTime);
+      } else if (FSUtils.isLogFile(path)) {
+        // Since the baseCommitTime is the only commit for new log files, it's okay here
+        String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path);
+        return commit.equals(fileCommitTime);
+      }
+      return false;
+    };
+    return fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
+  }
+}
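Note: the DELETE_DATA_FILES_ONLY and DELETE_DATA_AND_LOG_FILES branches above record scheme-less paths in the HoodieRollbackRequest by cutting everything up to the first ':' of the fully qualified path. A minimal standalone sketch of that transformation, using a made-up path that is not taken from this patch:

public class SchemeStripExample {
  public static void main(String[] args) {
    // Hypothetical fully qualified base file path, as FileStatus#getPath().toString() would return it.
    String fullPath = "hdfs://nameservice1/tbl/2021/09/01/abc123_1-0-1_20210901.parquet";
    // Same logic as the rollback helper above: drop the scheme, keep authority and path.
    String schemeLess = fullPath.substring(fullPath.indexOf(":") + 1);
    System.out.println(schemeLess); // prints //nameservice1/tbl/2021/09/01/abc123_1-0-1_20210901.parquet
  }
}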
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java
new file mode 100644
index 0000000000000..266fa39cb986e
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.rollback;
+
+import org.apache.hudi.avro.model.HoodieRollbackRequest;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieRollbackException;
+import org.apache.hudi.table.HoodieTable;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Listing based rollback strategy to fetch list of {@link HoodieRollbackRequest}s.
+ */
+public class ListingBasedRollbackStrategy implements BaseRollbackPlanActionExecutor.RollbackStrategy {
+
+  private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackStrategy.class);
+
+  protected final HoodieTable table;
+  protected final HoodieEngineContext context;
+  protected final HoodieWriteConfig config;
+  protected final String instantTime;
+
+  public ListingBasedRollbackStrategy(HoodieTable table,
+                                      HoodieEngineContext context,
+                                      HoodieWriteConfig config,
+                                      String instantTime) {
+    this.table = table;
+    this.context = context;
+    this.config = config;
+    this.instantTime = instantTime;
+  }
+
+  @Override
+  public List<HoodieRollbackRequest> getRollbackRequests(HoodieInstant instantToRollback) {
+    try {
+      List<ListingBasedRollbackRequest> rollbackRequests = null;
+      if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) {
+        rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context,
+            table.getMetaClient().getBasePath(), config);
+      } else {
+        rollbackRequests = RollbackUtils
+            .generateRollbackRequestsUsingFileListingMOR(instantToRollback, table, context);
+      }
+      List<HoodieRollbackRequest> listingBasedRollbackRequests = new ListingBasedRollbackHelper(table.getMetaClient(), config)
+          .getRollbackRequestsForRollbackPlan(context, instantToRollback, rollbackRequests);
+      return listingBasedRollbackRequests;
+    } catch (IOException e) {
+      LOG.error("Generating rollback requests failed for " + instantToRollback.getTimestamp(), e);
+      throw new HoodieRollbackException("Generating rollback requests failed for " + instantToRollback.getTimestamp(), e);
+    }
+  }
+}
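For context, a minimal usage sketch of the strategy above; it assumes a HoodieTable, HoodieEngineContext, HoodieWriteConfig and the instant being rolled back are already in hand (variable names are illustrative, not part of this patch):

// Illustrative only: table, context, config, instantTime and instantToRollback are assumed to exist.
BaseRollbackPlanActionExecutor.RollbackStrategy strategy =
    new ListingBasedRollbackStrategy(table, context, config, instantTime);
List<HoodieRollbackRequest> rollbackRequests = strategy.getRollbackRequests(instantToRollback);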
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java
new file mode 100644
index 0000000000000..9d04e3036f204
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.rollback;
+
+import org.apache.hudi.avro.model.HoodieRollbackRequest;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieFileFormat;
+import org.apache.hudi.common.model.HoodieLogFile;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.model.IOType;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieRollbackException;
+import org.apache.hudi.table.HoodieTable;
+import org.apache.hudi.table.marker.MarkerBasedRollbackUtils;
+import org.apache.hudi.table.marker.WriteMarkers;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import static org.apache.hudi.table.action.rollback.BaseRollbackHelper.EMPTY_STRING;
+
+/**
+ * Performs rollback using marker files generated during the write.
+ */
+public class MarkerBasedRollbackStrategy implements BaseRollbackPlanActionExecutor.RollbackStrategy {
+
+  private static final Logger LOG = LogManager.getLogger(MarkerBasedRollbackStrategy.class);
+
+  protected final HoodieTable table;
+
+  protected final transient HoodieEngineContext context;
+
+  protected final HoodieWriteConfig config;
+
+  protected final String basePath;
+
+  protected final String instantTime;
+
+  public MarkerBasedRollbackStrategy(HoodieTable table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) {
+    this.table = table;
+    this.context = context;
+    this.basePath = table.getMetaClient().getBasePath();
+    this.config = config;
+    this.instantTime = instantTime;
+  }
+
+  @Override
+  public List<HoodieRollbackRequest> getRollbackRequests(HoodieInstant instantToRollback) {
+    try {
+      List<String> markerPaths = MarkerBasedRollbackUtils.getAllMarkerPaths(
+          table, context, instantToRollback.getTimestamp(), config.getRollbackParallelism());
+      int parallelism = Math.max(Math.min(markerPaths.size(), config.getRollbackParallelism()), 1);
+      return context.map(markerPaths, markerFilePath -> {
+        String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1);
+        IOType type = IOType.valueOf(typeStr);
+        switch (type) {
+          case MERGE:
+          case CREATE:
+            String fileToDelete = WriteMarkers.stripMarkerSuffix(markerFilePath);
+            Path fullDeletePath = new Path(basePath, fileToDelete);
+            String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), fullDeletePath.getParent());
+            return new HoodieRollbackRequest(partitionPath, EMPTY_STRING, EMPTY_STRING,
+                Collections.singletonList(fullDeletePath.toString()),
+                Collections.emptyMap());
+          case APPEND:
+            return getRollbackRequestForAppend(WriteMarkers.stripMarkerSuffix(markerFilePath));
+          default:
+            throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback);
+        }
+      }, parallelism).stream().collect(Collectors.toList());
+    } catch (Exception e) {
+      throw new HoodieRollbackException("Error rolling back using marker files written for " + instantToRollback, e);
+    }
+  }
+
+  protected HoodieRollbackRequest getRollbackRequestForAppend(String appendBaseFilePath) throws IOException {
+    Path baseFilePathForAppend = new Path(basePath, appendBaseFilePath);
+    String fileId = FSUtils.getFileIdFromFilePath(baseFilePathForAppend);
+    String baseCommitTime = FSUtils.getCommitTime(baseFilePathForAppend.getName());
+    String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), new Path(basePath, appendBaseFilePath).getParent());
+    Map<FileStatus, Long> writtenLogFileSizeMap = getWrittenLogFileSizeMap(partitionPath, baseCommitTime, fileId);
+    Map<String, Long> writtenLogFileStrSizeMap = new HashMap<>();
+    for (Map.Entry<FileStatus, Long> entry : writtenLogFileSizeMap.entrySet()) {
+      writtenLogFileStrSizeMap.put(entry.getKey().getPath().toString(), entry.getValue());
+    }
+    return new HoodieRollbackRequest(partitionPath, fileId, baseCommitTime, Collections.emptyList(), writtenLogFileStrSizeMap);
+  }
+
+  /**
+   * Returns written log file size map for the respective baseCommitTime to assist in metadata table syncing.
+   *
+   * @param partitionPathStr partition path of interest
+   * @param baseCommitTime   base commit time of interest
+   * @param fileId           fileId of interest
+   * @return map of written log file {@link FileStatus} to its size
+   * @throws IOException if listing the log files fails
+   */
+  private Map<FileStatus, Long> getWrittenLogFileSizeMap(String partitionPathStr, String baseCommitTime, String fileId) throws IOException {
+    // collect all log files that are supposed to be deleted with this rollback
+    return FSUtils.getAllLogFiles(table.getMetaClient().getFs(),
+        FSUtils.getPartitionPath(config.getBasePath(), partitionPathStr), fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), baseCommitTime)
+        .collect(Collectors.toMap(HoodieLogFile::getFileStatus, value -> value.getFileStatus().getLen()));
+  }
+}
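The switch above relies on the marker file suffix: a marker is named after the data file it guards, followed by ".marker." and the IOType, so the text after the last '.' identifies the write operation. A small standalone sketch of that parsing, using a made-up marker path:

public class MarkerTypeParseExample {
  public static void main(String[] args) {
    // Hypothetical marker path, relative to the marker directory of an instant.
    String markerFilePath = "2021/09/01/abc123_1-0-1_20210901.parquet.marker.APPEND";
    // Same parsing as getRollbackRequests above: the suffix after the last '.' is the IOType name.
    String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1);
    System.out.println(typeStr); // prints APPEND
  }
}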
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java
similarity index 57%
rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseMergeOnReadRollbackActionExecutor.java
rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java
index 2e751443abc00..c2b25ffc5bf5a 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseMergeOnReadRollbackActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java
@@ -7,17 +7,19 @@
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ *   http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
  */
 
 package org.apache.hudi.table.action.rollback;
 
+import org.apache.hudi.avro.model.HoodieRollbackPlan;
 import org.apache.hudi.common.HoodieRollbackStat;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -25,38 +27,41 @@
 import org.apache.hudi.common.util.HoodieTimer;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.table.HoodieTable;
+
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
 import java.util.ArrayList;
 import java.util.List;
 
-public abstract class BaseMergeOnReadRollbackActionExecutor extends BaseRollbackActionExecutor {
+public class MergeOnReadRollbackActionExecutor extends BaseRollbackActionExecutor {
 
-  private static final Logger LOG = LogManager.getLogger(BaseMergeOnReadRollbackActionExecutor.class);
+  private static final Logger LOG = LogManager.getLogger(MergeOnReadRollbackActionExecutor.class);
 
-  public BaseMergeOnReadRollbackActionExecutor(HoodieEngineContext context,
-                                               HoodieWriteConfig config,
-                                               HoodieTable table,
-                                               String instantTime,
-                                               HoodieInstant commitInstant,
-                                               boolean deleteInstants) {
-    super(context, config, table, instantTime, commitInstant, deleteInstants);
+  public MergeOnReadRollbackActionExecutor(HoodieEngineContext context,
+                                           HoodieWriteConfig config,
+                                           HoodieTable table,
+                                           String instantTime,
+                                           HoodieInstant commitInstant,
+                                           boolean deleteInstants,
+                                           boolean skipLocking) {
+    super(context, config, table, instantTime, commitInstant, deleteInstants, skipLocking);
   }
 
-  public BaseMergeOnReadRollbackActionExecutor(HoodieEngineContext context,
-                                               HoodieWriteConfig config,
-                                               HoodieTable table,
-                                               String instantTime,
-                                               HoodieInstant commitInstant,
-                                               boolean deleteInstants,
-                                               boolean skipTimelinePublish,
-                                               boolean useMarkerBasedStrategy) {
-    super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy);
+  public MergeOnReadRollbackActionExecutor(HoodieEngineContext context,
+                                           HoodieWriteConfig config,
+                                           HoodieTable table,
+                                           String instantTime,
+                                           HoodieInstant commitInstant,
+                                           boolean deleteInstants,
+                                           boolean skipTimelinePublish,
+                                           boolean useMarkerBasedStrategy,
+                                           boolean skipLocking) {
+    super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy, skipLocking);
   }
 
   @Override
-  protected List<HoodieRollbackStat> executeRollback() {
+  protected List<HoodieRollbackStat> executeRollback(HoodieRollbackPlan hoodieRollbackPlan) {
     HoodieTimer rollbackTimer = new HoodieTimer();
     rollbackTimer.startTimer();
 
@@ -83,7 +88,7 @@ protected List executeRollback() {
     // deleting the timeline file
     if (!resolvedInstant.isRequested()) {
       LOG.info("Unpublished " + resolvedInstant);
-      allRollbackStats = getRollbackStrategy().execute(resolvedInstant);
+      allRollbackStats = executeRollback(instantToRollback, hoodieRollbackPlan);
     }
 
     dropBootstrapIndexIfNeeded(resolvedInstant);
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java
index d213fb18fcf7b..6ad4e1c986fb5 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java
@@ -22,17 +22,20 @@
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
+import org.apache.hudi.avro.model.HoodieRollbackPlan;
 import org.apache.hudi.common.HoodieRollbackStat;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieWriteStat;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.config.HoodieWriteConfig;
@@ -51,6 +54,20 @@ public class RollbackUtils {
 
   private static final Logger LOG = LogManager.getLogger(RollbackUtils.class);
 
+  /**
+   * Get the latest version of the rollback plan corresponding to a rollback instant.
+   * @param metaClient  Hoodie Table Meta Client
+   * @param rollbackInstant Instant referring to rollback action
+   * @return Rollback plan corresponding to rollback instant
+   * @throws IOException if the rollback plan cannot be read or deserialized
+   */
+  static HoodieRollbackPlan getRollbackPlan(HoodieTableMetaClient metaClient, HoodieInstant rollbackInstant)
+      throws IOException {
+    // TODO: add upgrade step if required.
+    return TimelineMetadataUtils.deserializeAvroMetadata(
+        metaClient.getActiveTimeline().readRollbackInfoAsBytes(rollbackInstant).get(), HoodieRollbackPlan.class);
+  }
+
   static Map<HoodieLogBlock.HeaderMetadataType, String> generateHeader(String instantToRollback, String rollbackInstantTime) {
     // generate metadata
     Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>(3);
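A hedged usage sketch of the new helper: once a rollback has been scheduled, the plan can be read back from the timeline and its requests executed. Names below are illustrative, and getRollbackRequests() is the assumed Avro-generated getter on HoodieRollbackPlan:

// Illustrative only: metaClient and rollbackInstant are assumed to exist.
HoodieRollbackPlan plan = RollbackUtils.getRollbackPlan(metaClient, rollbackInstant);
// Assumed Avro-generated accessor for the requests serialized into the plan.
List<HoodieRollbackRequest> requests = plan.getRollbackRequests();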
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java
new file mode 100644
index 0000000000000..acd1c50badbc7
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.rollback;
+
+import org.apache.hudi.avro.model.HoodieRollbackRequest;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * HoodieRollbackRequest in HoodieRollbackPlan (avro pojo) is not directly usable within the spark parallel engine.
+ * Hence it is converted to this {@link SerializableHoodieRollbackRequest} and then used within spark.parallelize.
+ */
+public class SerializableHoodieRollbackRequest {
+
+  private final String partitionPath;
+  private final String fileId;
+  private final String latestBaseInstant;
+  private final List<String> filesToBeDeleted = new ArrayList<>();
+  private final Map<String, Long> logBlocksToBeDeleted = new HashMap<>();
+
+  public SerializableHoodieRollbackRequest(HoodieRollbackRequest rollbackRequest) {
+    this.partitionPath = rollbackRequest.getPartitionPath();
+    this.fileId = rollbackRequest.getFileId();
+    this.latestBaseInstant = rollbackRequest.getLatestBaseInstant();
+    this.filesToBeDeleted.addAll(rollbackRequest.getFilesToBeDeleted());
+    this.logBlocksToBeDeleted.putAll(rollbackRequest.getLogBlocksToBeDeleted());
+  }
+
+  public String getPartitionPath() {
+    return partitionPath;
+  }
+
+  public String getFileId() {
+    return fileId;
+  }
+
+  public String getLatestBaseInstant() {
+    return latestBaseInstant;
+  }
+
+  public List<String> getFilesToBeDeleted() {
+    return filesToBeDeleted;
+  }
+
+  public Map<String, Long> getLogBlocksToBeDeleted() {
+    return logBlocksToBeDeleted;
+  }
+}
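As the class javadoc notes, the Avro-generated HoodieRollbackRequest is wrapped before being distributed. A minimal sketch of that conversion (illustrative; the executor code performing this step is not part of this hunk):

// Illustrative only: rollbackRequests is a List<HoodieRollbackRequest> taken from the rollback plan.
List<SerializableHoodieRollbackRequest> serializableRequests = rollbackRequests.stream()
    .map(SerializableHoodieRollbackRequest::new)
    .collect(Collectors.toList());
// serializableRequests can now be handed to the engine context, e.g. context.map(serializableRequests, ..., parallelism).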
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/AbstractUpgradeDowngrade.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/AbstractUpgradeDowngrade.java
deleted file mode 100644
index 0a74689c5be3e..0000000000000
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/AbstractUpgradeDowngrade.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.table.upgrade;
-
-import org.apache.hudi.common.config.ConfigProperty;
-import org.apache.hudi.common.engine.HoodieEngineContext;
-import org.apache.hudi.common.table.HoodieTableConfig;
-import org.apache.hudi.common.table.HoodieTableMetaClient;
-import org.apache.hudi.common.table.HoodieTableVersion;
-import org.apache.hudi.common.util.FileIOUtils;
-import org.apache.hudi.config.HoodieWriteConfig;
-
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
-
-import java.io.IOException;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Properties;
-
-/**
- * Helper class to assist in upgrading/downgrading Hoodie when there is a version change.
- */
-public abstract class AbstractUpgradeDowngrade {
-
-  private static final Logger LOG = LogManager.getLogger(AbstractUpgradeDowngrade.class);
-  public static final String HOODIE_UPDATED_PROPERTY_FILE = "hoodie.properties.updated";
-
-  private HoodieTableMetaClient metaClient;
-  protected HoodieWriteConfig config;
-  protected HoodieEngineContext context;
-  private transient FileSystem fs;
-  private Path updatedPropsFilePath;
-  private Path propsFilePath;
-
-  /**
-   * Perform Upgrade or Downgrade steps if required and updated table version if need be.
-   * 
-   * Starting from version 0.6.0, this upgrade/downgrade step will be added in all write paths.
-   *
-   * Essentially, if a dataset was created using any pre 0.6.0(for eg 0.5.3), and Hoodie version was upgraded to 0.6.0,
-   * Hoodie table version gets bumped to 1 and there are some upgrade steps need to be executed before doing any writes.
-   * Similarly, if a dataset was created using Hoodie version 0.6.0 or Hoodie table version 1 and then hoodie was downgraded
-   * to pre 0.6.0 or to Hoodie table version 0, then some downgrade steps need to be executed before proceeding w/ any writes.
-   *
-   * On a high level, these are the steps performed
-   *
-   * Step1 : Understand current hoodie table version and table version from hoodie.properties file
-   * Step2 : Delete any left over .updated from previous upgrade/downgrade
-   * Step3 : If version are different, perform upgrade/downgrade.
-   * Step4 : Copy hoodie.properties -> hoodie.properties.updated with the version updated
-   * Step6 : Rename hoodie.properties.updated to hoodie.properties
-   *
- * - * @param metaClient instance of {@link HoodieTableMetaClient} to use - * @param toVersion version to which upgrade or downgrade has to be done. - * @param config instance of {@link HoodieWriteConfig} to use. - * @param context instance of {@link HoodieEngineContext} to use. - * @param instantTime current instant time that should not be touched. - */ - public abstract void run(HoodieTableMetaClient metaClient, HoodieTableVersion toVersion, HoodieWriteConfig config, - HoodieEngineContext context, String instantTime); - - public boolean needsUpgradeOrDowngrade(HoodieTableVersion toVersion) { - HoodieTableVersion fromVersion = metaClient.getTableConfig().getTableVersion(); - // Ensure no inflight commits & versions are same - return toVersion.versionCode() != fromVersion.versionCode(); - } - - protected AbstractUpgradeDowngrade(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context) { - this.metaClient = metaClient; - this.config = config; - this.context = context; - this.fs = metaClient.getFs(); - this.updatedPropsFilePath = new Path(metaClient.getMetaPath(), HOODIE_UPDATED_PROPERTY_FILE); - this.propsFilePath = new Path(metaClient.getMetaPath(), HoodieTableConfig.HOODIE_PROPERTIES_FILE); - } - - protected void run(HoodieTableVersion toVersion, String instantTime) throws IOException { - // Fetch version from property file and current version - HoodieTableVersion fromVersion = metaClient.getTableConfig().getTableVersion(); - if (!needsUpgradeOrDowngrade(toVersion)) { - return; - } - - if (fs.exists(updatedPropsFilePath)) { - // this can be left over .updated file from a failed attempt before. Many cases exist here. - // a) We failed while writing the .updated file and it's content is partial (e.g hdfs) - // b) We failed without renaming the file to hoodie.properties. We will re-attempt everything now anyway - // c) rename() is not atomic in cloud stores. so hoodie.properties is fine, but we failed before deleting the .updated file - // All cases, it simply suffices to delete the file and proceed. - LOG.info("Deleting existing .updated file with content :" + FileIOUtils.readAsUTFString(fs.open(updatedPropsFilePath))); - fs.delete(updatedPropsFilePath, false); - } - - // Perform the actual upgrade/downgrade; this has to be idempotent, for now. 
- LOG.info("Attempting to move table from version " + fromVersion + " to " + toVersion); - Map tableProps = new HashMap<>(); - if (fromVersion.versionCode() < toVersion.versionCode()) { - // upgrade - while (fromVersion.versionCode() < toVersion.versionCode()) { - HoodieTableVersion nextVersion = HoodieTableVersion.versionFromCode(fromVersion.versionCode() + 1); - tableProps.putAll(upgrade(fromVersion, nextVersion, instantTime)); - fromVersion = nextVersion; - } - } else { - // downgrade - while (fromVersion.versionCode() > toVersion.versionCode()) { - HoodieTableVersion prevVersion = HoodieTableVersion.versionFromCode(fromVersion.versionCode() - 1); - tableProps.putAll(downgrade(fromVersion, prevVersion, instantTime)); - fromVersion = prevVersion; - } - } - - // Write out the current version in hoodie.properties.updated file - for (Map.Entry entry: tableProps.entrySet()) { - metaClient.getTableConfig().setValue(entry.getKey(), entry.getValue()); - } - metaClient.getTableConfig().setTableVersion(toVersion); - createUpdatedFile(metaClient.getTableConfig().getProps()); - - // because for different fs the fs.rename have different action,such as: - // a) for hdfs : if propsFilePath already exist,fs.rename will not replace propsFilePath, but just return false - // b) for localfs: if propsFilePath already exist,fs.rename will replace propsFilePath, and return ture - // c) for aliyun ossfs: if propsFilePath already exist,will throw FileAlreadyExistsException - // so we should delete the old propsFilePath. also upgrade and downgrade is Idempotent - if (fs.exists(propsFilePath)) { - fs.delete(propsFilePath, false); - } - // Rename the .updated file to hoodie.properties. This is atomic in hdfs, but not in cloud stores. - // But as long as this does not leave a partial hoodie.properties file, we are okay. - fs.rename(updatedPropsFilePath, propsFilePath); - } - - private void createUpdatedFile(Properties props) throws IOException { - try (FSDataOutputStream outputStream = fs.create(updatedPropsFilePath)) { - props.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); - } - } - - protected abstract Map upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime); - - protected abstract Map downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime); -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseUpgradeDowngradeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseUpgradeDowngradeHelper.java new file mode 100644 index 0000000000000..d3f157be954da --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseUpgradeDowngradeHelper.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +/** + * Interface for engine-specific logic needed for upgrade and downgrade actions. + */ +public interface BaseUpgradeDowngradeHelper { + /** + * @param config Write config. + * @param context {@link HoodieEngineContext} instance to use. + * @return A new Hudi table for upgrade and downgrade actions. + */ + HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context); + + /** + * @param config Write config. + * @return partition columns in String. + */ + String getPartitionColumns(HoodieWriteConfig config); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java index 7501ed5faf69c..24b9d6f5da6f4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java @@ -32,10 +32,13 @@ public interface DowngradeHandler { /** * to be invoked to downgrade hoodie table from one version to a lower version. * - * @param config instance of {@link HoodieWriteConfig} to be used. - * @param context instance of {@link HoodieEngineContext} to be used. - * @param instantTime current instant time that should not touched. + * @param config instance of {@link HoodieWriteConfig} to be used. + * @param context instance of {@link HoodieEngineContext} to be used. + * @param instantTime current instant time that should not touched. + * @param upgradeDowngradeHelper instance of {@link BaseUpgradeDowngradeHelper} to be used. * @return Map of config properties and its values to be added to table properties. */ - Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime); + Map downgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + BaseUpgradeDowngradeHelper upgradeDowngradeHelper); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseOneToTwoUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java similarity index 77% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseOneToTwoUpgradeHandler.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java index e3a14e487b134..dddd5f4ac1410 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseOneToTwoUpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java @@ -27,16 +27,19 @@ import java.util.HashMap; import java.util.Map; -public abstract class BaseOneToTwoUpgradeHandler implements UpgradeHandler { +/** + * Upgrade handle to assist in upgrading hoodie table from version 1 to 2. 
+ */ +public class OneToTwoUpgradeHandler implements UpgradeHandler { @Override - public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) { + public Map upgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { Map tablePropsToAdd = new HashMap<>(); - tablePropsToAdd.put(HoodieTableConfig.PARTITION_FIELDS, getPartitionColumns(config)); + tablePropsToAdd.put(HoodieTableConfig.PARTITION_FIELDS, upgradeDowngradeHelper.getPartitionColumns(config)); tablePropsToAdd.put(HoodieTableConfig.RECORDKEY_FIELDS, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key())); tablePropsToAdd.put(HoodieTableConfig.BASE_FILE_FORMAT, config.getString(HoodieTableConfig.BASE_FILE_FORMAT)); return tablePropsToAdd; } - - abstract String getPartitionColumns(HoodieWriteConfig config); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseOneToZeroDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java similarity index 82% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseOneToZeroDowngradeHandler.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java index 5997e181229c3..e6051cf321b50 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseOneToZeroDowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java @@ -32,12 +32,17 @@ import java.util.Map; import java.util.stream.Collectors; -public abstract class BaseOneToZeroDowngradeHandler implements DowngradeHandler { +/** + * Downgrade handle to assist in downgrading hoodie table from version 1 to 0. + */ +public class OneToZeroDowngradeHandler implements DowngradeHandler { @Override - public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) { + public Map downgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + HoodieTable table = upgradeDowngradeHelper.getTable(config, context); // fetch pending commit info - HoodieTable table = getTable(config, context); HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); List commits = inflightTimeline.getReverseOrderedInstants().collect(Collectors.toList()); for (HoodieInstant inflightInstant : commits) { @@ -47,6 +52,4 @@ public Map downgrade(HoodieWriteConfig config, HoodieEng } return Collections.EMPTY_MAP; } - - abstract HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java new file mode 100644 index 0000000000000..964859c0ae07d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; + +import java.util.Collections; +import java.util.Map; + +/** + * Downgrade handler to assist in downgrading hoodie table from version 3 to 2. + */ +public class ThreeToTwoDowngradeHandler implements DowngradeHandler { + + @Override + public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + if (config.isMetadataTableEnabled()) { + // Metadata Table in version 3 is synchronous and in version 2 is asynchronous. Downgrading to asynchronous + // removes the checks in code to decide whether to use a LogBlock or not. Also, the schema for the + // table has been updated and is not forward compatible. Hence, we need to delete the table. + HoodieTableMetadataUtil.deleteMetadataTable(config.getBasePath(), context); + } + return Collections.emptyMap(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseTwoToOneDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java similarity index 90% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseTwoToOneDowngradeHandler.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java index 621711a3ffccd..ee638a16f8633 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseTwoToOneDowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java @@ -46,12 +46,16 @@ import java.util.stream.Collectors; import static org.apache.hudi.common.util.MarkerUtils.MARKERS_FILENAME_PREFIX; - -public abstract class BaseTwoToOneDowngradeHandler implements DowngradeHandler { +/** + * Downgrade handler to assist in downgrading hoodie table from version 2 to 1. 
+ */ +public class TwoToOneDowngradeHandler implements DowngradeHandler { @Override - public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) { - HoodieTable table = getTable(config, context); + public Map downgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + HoodieTable table = upgradeDowngradeHelper.getTable(config, context); HoodieTableMetaClient metaClient = table.getMetaClient(); // re-create marker files if any partial timeline server based markers are found @@ -69,8 +73,6 @@ public Map downgrade(HoodieWriteConfig config, HoodieEng return Collections.EMPTY_MAP; } - abstract HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context); - /** * Converts the markers in new format(timeline server based) to old format of direct markers, * i.e., one marker file per data file, without MARKERS.type file. @@ -106,8 +108,7 @@ private void convertToDirectMarkers(final String commitInstantTime, // Deletes marker type file MarkerUtils.deleteMarkerTypeFile(fileSystem, markerDir); // Deletes timeline server based markers - deleteTimelineBasedMarkerFiles( - context, markerDir, fileSystem, table.getConfig().getMarkersDeleteParallelism()); + deleteTimelineBasedMarkerFiles(context, markerDir, fileSystem, parallelism); break; default: throw new HoodieException("The marker type \"" + markerTypeOption.get().name() @@ -116,8 +117,7 @@ private void convertToDirectMarkers(final String commitInstantTime, } else { // In case of partial failures during downgrade, there is a chance that marker type file was deleted, // but timeline server based marker files are left. So deletes them if any - deleteTimelineBasedMarkerFiles( - context, markerDir, fileSystem, table.getConfig().getMarkersDeleteParallelism()); + deleteTimelineBasedMarkerFiles(context, markerDir, fileSystem, parallelism); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java new file mode 100644 index 0000000000000..e1dbfbbe2a51d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; + +import java.util.HashMap; +import java.util.Map; + +/** + * UpgradeHandler to assist in upgrading {@link org.apache.hudi.table.HoodieTable} from version 2 to 3. + */ +public class TwoToThreeUpgradeHandler implements UpgradeHandler { + @Override + public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + if (config.isMetadataTableEnabled()) { + // Metadata Table in version 2 is asynchronous and in version 3 is synchronous. Synchronous table will not + // sync any instants not already synced. So its simpler to re-bootstrap the table. Also, the schema for the + // table has been updated and is not backward compatible. + HoodieTableMetadataUtil.deleteMetadataTable(config.getBasePath(), context); + } + Map tablePropsToAdd = new HashMap<>(); + tablePropsToAdd.put(HoodieTableConfig.URL_ENCODE_PARTITIONING, config.getStringOrDefault(HoodieTableConfig.URL_ENCODE_PARTITIONING)); + tablePropsToAdd.put(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE, config.getStringOrDefault(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)); + tablePropsToAdd.put(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME, config.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME)); + return tablePropsToAdd; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java new file mode 100644 index 0000000000000..c5ae043d17818 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpgradeDowngradeException; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +/** + * Helper class to assist in upgrading/downgrading Hoodie when there is a version change. + */ +public class UpgradeDowngrade { + + private static final Logger LOG = LogManager.getLogger(UpgradeDowngrade.class); + public static final String HOODIE_UPDATED_PROPERTY_FILE = "hoodie.properties.updated"; + + private final BaseUpgradeDowngradeHelper upgradeDowngradeHelper; + private HoodieTableMetaClient metaClient; + protected HoodieWriteConfig config; + protected HoodieEngineContext context; + private transient FileSystem fs; + private Path updatedPropsFilePath; + private Path propsFilePath; + + public UpgradeDowngrade( + HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context, + BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + this.metaClient = metaClient; + this.config = config; + this.context = context; + this.fs = metaClient.getFs(); + this.updatedPropsFilePath = new Path(metaClient.getMetaPath(), HOODIE_UPDATED_PROPERTY_FILE); + this.propsFilePath = new Path(metaClient.getMetaPath(), HoodieTableConfig.HOODIE_PROPERTIES_FILE); + this.upgradeDowngradeHelper = upgradeDowngradeHelper; + } + + public boolean needsUpgradeOrDowngrade(HoodieTableVersion toVersion) { + HoodieTableVersion fromVersion = metaClient.getTableConfig().getTableVersion(); + // Ensure versions are same + return toVersion.versionCode() != fromVersion.versionCode(); + } + + /** + * Perform Upgrade or Downgrade steps if required and updated table version if need be. + *

+   * Starting from version 0.6.0, this upgrade/downgrade step will be added in all write paths.
+   *
+   * Essentially, if a dataset was created using a previous table version in an older release,
+   * and Hoodie version was upgraded to a new release with new table version supported,
+   * Hoodie table version gets bumped to the new version and there are some upgrade steps that need
+   * to be executed before doing any writes.
+   *
+   * Similarly, if a dataset was created using a newer table version in a newer release,
+   * and then hoodie was downgraded to an older release or to an older Hoodie table version,
+   * then some downgrade steps need to be executed before proceeding w/ any writes.
+   *
+   * Below shows the table version corresponding to the Hudi release:
+   * Hudi release -> table version
+   * pre 0.6.0 -> v0
+   * 0.6.0 to 0.8.0 -> v1
+   * 0.9.0 -> v2
+   * 0.10.0 to current -> v3
+   *
+   * On a high level, these are the steps performed
+   *
+   * Step1 : Understand current hoodie table version and table version from hoodie.properties file
+   * Step2 : Delete any left over .updated from previous upgrade/downgrade
+   * Step3 : If version are different, perform upgrade/downgrade.
+   * Step4 : Copy hoodie.properties -> hoodie.properties.updated with the version updated
+   * Step6 : Rename hoodie.properties.updated to hoodie.properties
+   *
+ * + * @param toVersion version to which upgrade or downgrade has to be done. + * @param instantTime current instant time that should not be touched. + */ + public void run(HoodieTableVersion toVersion, String instantTime) { + try { + // Fetch version from property file and current version + HoodieTableVersion fromVersion = metaClient.getTableConfig().getTableVersion(); + if (!needsUpgradeOrDowngrade(toVersion)) { + return; + } + + if (fs.exists(updatedPropsFilePath)) { + // this can be left over .updated file from a failed attempt before. Many cases exist here. + // a) We failed while writing the .updated file and it's content is partial (e.g hdfs) + // b) We failed without renaming the file to hoodie.properties. We will re-attempt everything now anyway + // c) rename() is not atomic in cloud stores. so hoodie.properties is fine, but we failed before deleting the .updated file + // All cases, it simply suffices to delete the file and proceed. + LOG.info("Deleting existing .updated file with content :" + FileIOUtils.readAsUTFString(fs.open(updatedPropsFilePath))); + fs.delete(updatedPropsFilePath, false); + } + + // Perform the actual upgrade/downgrade; this has to be idempotent, for now. + LOG.info("Attempting to move table from version " + fromVersion + " to " + toVersion); + Map tableProps = new HashMap<>(); + if (fromVersion.versionCode() < toVersion.versionCode()) { + // upgrade + while (fromVersion.versionCode() < toVersion.versionCode()) { + HoodieTableVersion nextVersion = HoodieTableVersion.versionFromCode(fromVersion.versionCode() + 1); + tableProps.putAll(upgrade(fromVersion, nextVersion, instantTime)); + fromVersion = nextVersion; + } + } else { + // downgrade + while (fromVersion.versionCode() > toVersion.versionCode()) { + HoodieTableVersion prevVersion = HoodieTableVersion.versionFromCode(fromVersion.versionCode() - 1); + tableProps.putAll(downgrade(fromVersion, prevVersion, instantTime)); + fromVersion = prevVersion; + } + } + + // Write out the current version in hoodie.properties.updated file + for (Map.Entry entry : tableProps.entrySet()) { + metaClient.getTableConfig().setValue(entry.getKey(), entry.getValue()); + } + metaClient.getTableConfig().setTableVersion(toVersion); + createUpdatedFile(metaClient.getTableConfig().getProps()); + + // because for different fs the fs.rename have different action,such as: + // a) for hdfs : if propsFilePath already exist,fs.rename will not replace propsFilePath, but just return false + // b) for localfs: if propsFilePath already exist,fs.rename will replace propsFilePath, and return ture + // c) for aliyun ossfs: if propsFilePath already exist,will throw FileAlreadyExistsException + // so we should delete the old propsFilePath. also upgrade and downgrade is Idempotent + if (fs.exists(propsFilePath)) { + fs.delete(propsFilePath, false); + } + // Rename the .updated file to hoodie.properties. This is atomic in hdfs, but not in cloud stores. + // But as long as this does not leave a partial hoodie.properties file, we are okay. 
+ fs.rename(updatedPropsFilePath, propsFilePath); + } catch (IOException e) { + throw new HoodieUpgradeDowngradeException("Error during upgrade/downgrade to version:" + toVersion, e); + } + } + + private void createUpdatedFile(Properties props) throws IOException { + try (FSDataOutputStream outputStream = fs.create(updatedPropsFilePath)) { + props.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); + } + } + + protected Map upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { + if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) { + return new ZeroToOneUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.TWO) { + return new OneToTwoUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.TWO && toVersion == HoodieTableVersion.THREE) { + return new TwoToThreeUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else { + throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), true); + } + } + + protected Map downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { + if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.ZERO) { + return new OneToZeroDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.TWO && toVersion == HoodieTableVersion.ONE) { + return new TwoToOneDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.THREE && toVersion == HoodieTableVersion.TWO) { + return new ThreeToTwoDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); + } else { + throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), false); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java index 8ca6f0e86beb6..9dc477ffc9dc6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java @@ -32,10 +32,13 @@ public interface UpgradeHandler { /** * to be invoked to upgrade hoodie table from one version to a higher version. * - * @param config instance of {@link HoodieWriteConfig} to be used. - * @param context instance of {@link HoodieEngineContext} to be used. - * @param instantTime current instant time that should not be touched. + * @param config instance of {@link HoodieWriteConfig} to be used. + * @param context instance of {@link HoodieEngineContext} to be used. + * @param instantTime current instant time that should not be touched. + * @param upgradeDowngradeHelper instance of {@link BaseUpgradeDowngradeHelper} to be used. * @return Map of config properties and its values to be added to table properties. 
*/ - Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime); + Map upgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + BaseUpgradeDowngradeHelper upgradeDowngradeHelper); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseZeroToOneUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java similarity index 79% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseZeroToOneUpgradeHandler.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java index f0e3e4f1eb6f7..18815b2e132da 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseZeroToOneUpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.upgrade; +import org.apache.hudi.avro.model.HoodieRollbackRequest; import org.apache.hudi.common.HoodieRollbackStat; import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -33,6 +34,8 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.rollback.BaseRollbackHelper; +import org.apache.hudi.table.action.rollback.ListingBasedRollbackHelper; import org.apache.hudi.table.action.rollback.ListingBasedRollbackRequest; import org.apache.hudi.table.action.rollback.RollbackUtils; import org.apache.hudi.table.marker.WriteMarkers; @@ -46,13 +49,17 @@ import java.util.Map; import java.util.stream.Collectors; -public abstract class BaseZeroToOneUpgradeHandler implements UpgradeHandler { +/** + * Upgrade handle to assist in upgrading hoodie table from version 0 to 1. + */ +public class ZeroToOneUpgradeHandler implements UpgradeHandler { @Override - public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) { + public Map upgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { // fetch pending commit info - //HoodieSparkTable table = HoodieSparkTable.create(config, context); - HoodieTable table = getTable(config, context); + HoodieTable table = upgradeDowngradeHelper.getTable(config, context); HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); List commits = inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp) .collect(Collectors.toList()); @@ -67,8 +74,6 @@ public Map upgrade(HoodieWriteConfig config, HoodieEngin return Collections.EMPTY_MAP; } - abstract HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context); - /** * Recreate markers in new format. * Step1: Delete existing markers @@ -76,14 +81,14 @@ public Map upgrade(HoodieWriteConfig config, HoodieEngin * Step3: recreate markers for all interested files. * * @param commitInstantTime instant of interest for which markers need to be recreated. 
- * @param table instance of {@link HoodieTable} to use - * @param context instance of {@link HoodieEngineContext} to use + * @param table instance of {@link HoodieTable} to use + * @param context instance of {@link HoodieEngineContext} to use * @throws HoodieRollbackException on any exception during upgrade. */ protected void recreateMarkers(final String commitInstantTime, - HoodieTable table, - HoodieEngineContext context, - int parallelism) throws HoodieRollbackException { + HoodieTable table, + HoodieEngineContext context, + int parallelism) throws HoodieRollbackException { try { // fetch hoodie instant Option commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants() @@ -121,9 +126,13 @@ protected void recreateMarkers(final String commitInstantTime, } } - abstract List getListBasedRollBackStats(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - HoodieEngineContext context, Option commitInstantOpt, - List rollbackRequests); + List getListBasedRollBackStats( + HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context, + Option commitInstantOpt, List rollbackRequests) { + List hoodieRollbackRequests = new ListingBasedRollbackHelper(metaClient, config) + .getRollbackRequestsForRollbackPlan(context, commitInstantOpt.get(), rollbackRequests); + return new BaseRollbackHelper(metaClient, config).collectRollbackStats(context, commitInstantOpt.get(), hoodieRollbackRequests); + } /** * Curates file name for marker from existing log file path. @@ -131,6 +140,7 @@ abstract List getListBasedRollBackStats(HoodieTableMetaClien * marker file format : partitionpath/fileId_writetoken_baseinstant.basefileExtn.marker.APPEND * * @param logFilePath log file path for which marker file name needs to be generated. + * @param table {@link HoodieTable} instance to use * @return the marker file name thus curated. */ private static String getFileNameForMarkerFromLogFile(String logFilePath, HoodieTable table) { diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/FileSystemBasedLockProviderTestClass.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/FileSystemBasedLockProviderTestClass.java index c6a1527e292b0..702a84a97b300 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/FileSystemBasedLockProviderTestClass.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/FileSystemBasedLockProviderTestClass.java @@ -28,6 +28,7 @@ import org.apache.hudi.exception.HoodieLockException; import java.io.IOException; +import java.io.Serializable; import java.util.concurrent.TimeUnit; import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; @@ -39,12 +40,12 @@ * create operation. This lock does not support cleaning/expiring the lock after a failed write hence cannot be used * in production environments. 
*/ -public class FileSystemBasedLockProviderTestClass implements LockProvider { +public class FileSystemBasedLockProviderTestClass implements LockProvider, Serializable { private static final String LOCK_NAME = "acquired"; private String lockPath; - private FileSystem fs; + private transient FileSystem fs; protected LockConfiguration lockConfiguration; public FileSystemBasedLockProviderTestClass(final LockConfiguration lockConfiguration, final Configuration configuration) { @@ -55,7 +56,7 @@ public FileSystemBasedLockProviderTestClass(final LockConfiguration lockConfigur public void acquireLock() { try { - fs.create(new Path(lockPath + "/" + LOCK_NAME)).close(); + fs.create(new Path(lockPath + "/" + LOCK_NAME), false).close(); } catch (IOException e) { throw new HoodieIOException("Failed to acquire lock", e); } @@ -78,7 +79,12 @@ public boolean tryLock(long time, TimeUnit unit) { && (numRetries <= lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY))) { Thread.sleep(lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY)); } - acquireLock(); + synchronized (LOCK_NAME) { + if (fs.exists(new Path(lockPath + "/" + LOCK_NAME))) { + return false; + } + acquireLock(); + } return true; } catch (IOException | InterruptedException e) { throw new HoodieLockException("Failed to acquire lock", e); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestSimpleConcurrentFileWritesConflictResolutionStrategy.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestSimpleConcurrentFileWritesConflictResolutionStrategy.java index abe2a945628a4..afe8e05aa0662 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestSimpleConcurrentFileWritesConflictResolutionStrategy.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestSimpleConcurrentFileWritesConflictResolutionStrategy.java @@ -314,7 +314,7 @@ private void createCommit(String instantTime) throws Exception { commitMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); commitMetadata.setOperationType(WriteOperationType.INSERT); HoodieTestTable.of(metaClient) - .addCommit(instantTime, commitMetadata) + .addCommit(instantTime, Option.of(commitMetadata)) .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); } @@ -362,7 +362,7 @@ private void createCompaction(String instantTime) throws Exception { writeStat.setFileId("file-1"); commitMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); HoodieTestTable.of(metaClient) - .addCommit(instantTime, commitMetadata) + .addCommit(instantTime, Option.of(commitMetadata)) .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java new file mode 100644 index 0000000000000..fa0f5df61b183 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.testutils; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; + +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * {@link HoodieTestTable} impl used for testing metadata. This class does synchronous updates to HoodieTableMetadataWriter if non null. + */ +public class HoodieMetadataTestTable extends HoodieTestTable { + + private final HoodieTableMetadataWriter writer; + + protected HoodieMetadataTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, HoodieTableMetadataWriter writer) { + super(basePath, fs, metaClient); + this.writer = writer; + } + + public static HoodieTestTable of(HoodieTableMetaClient metaClient) { + return HoodieMetadataTestTable.of(metaClient, null); + } + + public static HoodieTestTable of(HoodieTableMetaClient metaClient, HoodieTableMetadataWriter writer) { + testTableState = HoodieTestTableState.of(); + return new HoodieMetadataTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient, writer); + } + + /** + * Add commits to the requested partitions and update metadata table. + * + * @param commitTime - Commit time for the operation + * @param operationType - Operation type + * @param newPartitionsToAdd - New partitions to add for the operation + * @param partitionToFilesNameLengthMap - Map of partition names to its list of files name and length pair + * @param bootstrap - Whether bootstrapping needed for the operation + * @param createInflightCommit - Whether in flight commit needed for the operation + * @return Commit metadata for the commit operation performed. 
+ * @throws Exception + */ + @Override + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, + Map>> partitionToFilesNameLengthMap, + boolean bootstrap, boolean createInflightCommit) throws Exception { + HoodieCommitMetadata commitMetadata = super.doWriteOperation(commitTime, operationType, newPartitionsToAdd, + partitionToFilesNameLengthMap, bootstrap, createInflightCommit); + if (writer != null && !createInflightCommit) { + writer.update(commitMetadata, commitTime, false); + } + return commitMetadata; + } + + @Override + public HoodieTestTable moveInflightCommitToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException { + super.moveInflightCommitToComplete(instantTime, metadata); + if (writer != null) { + writer.update(metadata, instantTime, false); + } + return this; + } + + public HoodieTestTable moveInflightCommitToComplete(String instantTime, HoodieCommitMetadata metadata, boolean ignoreWriter) throws IOException { + super.moveInflightCommitToComplete(instantTime, metadata); + if (!ignoreWriter && writer != null) { + writer.update(metadata, instantTime, false); + } + return this; + } + + @Override + public HoodieTestTable moveInflightCompactionToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException { + super.moveInflightCompactionToComplete(instantTime, metadata); + if (writer != null) { + writer.update(metadata, instantTime, true); + } + return this; + } + + @Override + public HoodieCleanMetadata doClean(String commitTime, Map partitionFileCountsToDelete) throws IOException { + HoodieCleanMetadata cleanMetadata = super.doClean(commitTime, partitionFileCountsToDelete); + if (writer != null) { + writer.update(cleanMetadata, commitTime); + } + return cleanMetadata; + } + + public HoodieTestTable addCompaction(String instantTime, HoodieCommitMetadata commitMetadata) throws Exception { + super.addCompaction(instantTime, commitMetadata); + if (writer != null) { + writer.update(commitMetadata, instantTime, true); + } + return this; + } + + @Override + public HoodieTestTable addRollback(String instantTime, HoodieRollbackMetadata rollbackMetadata) throws IOException { + super.addRollback(instantTime, rollbackMetadata); + if (writer != null) { + writer.update(rollbackMetadata, instantTime); + } + return this; + } + + @Override + public HoodieTestTable addRestore(String instantTime, HoodieRestoreMetadata restoreMetadata) throws IOException { + super.addRestore(instantTime, restoreMetadata); + if (writer != null) { + writer.update(restoreMetadata, instantTime); + } + return this; + } + + @Override + public HoodieTestTable addReplaceCommit( + String instantTime, + Option requestedReplaceMetadata, + Option inflightReplaceMetadata, + HoodieReplaceCommitMetadata completeReplaceMetadata) throws Exception { + super.addReplaceCommit(instantTime, requestedReplaceMetadata, inflightReplaceMetadata, completeReplaceMetadata); + if (writer != null) { + writer.update(completeReplaceMetadata, instantTime, true); + } + return this; + } + +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java new file mode 100644 index 0000000000000..04920635684a9 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.engine.TaskContextSupplier; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.compress.Compression; +import org.apache.hadoop.hbase.io.hfile.CacheConfig; +import org.apache.hadoop.hbase.util.Pair; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.Mockito; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM; +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestHoodieHFileReaderWriter { + @TempDir File tempDir; + private Path filePath; + + @BeforeEach + public void setup() throws IOException { + filePath = new Path(tempDir.toString() + "tempFile.txt"); + } + + @AfterEach + public void clearTempFile() { + File file = new File(filePath.toString()); + if (file.exists()) { + file.delete(); + } + } + + private HoodieHFileWriter createHFileWriter(Schema avroSchema) throws Exception { + BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name()); + Configuration conf = new Configuration(); + TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class); + String instantTime = "000"; + + HoodieHFileConfig hoodieHFileConfig = new HoodieHFileConfig(conf, Compression.Algorithm.GZ, 1024 * 1024, 120 * 1024 * 1024, + filter); + return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier); + } + + @Test + public void testWriteReadHFile() throws Exception { + Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc"); + HoodieHFileWriter writer = createHFileWriter(avroSchema); + List keys = new ArrayList<>(); + Map recordMap = new HashMap<>(); + for (int i = 0; i < 100; i++) { + GenericRecord record = new GenericData.Record(avroSchema); + String key = String.format("%s%04d", "key", i); + record.put("_row_key", 
key); + keys.add(key); + record.put("time", Integer.toString(RANDOM.nextInt())); + record.put("number", i); + writer.writeAvro(key, record); + recordMap.put(key, record); + } + writer.close(); + + Configuration conf = new Configuration(); + CacheConfig cacheConfig = new CacheConfig(conf); + HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf)); + List> records = hoodieHFileReader.readAllRecords(); + records.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst()))); + hoodieHFileReader.close(); + + for (int i = 0; i < 20; i++) { + int randomRowstoFetch = 5 + RANDOM.nextInt(50); + Set rowsToFetch = getRandomKeys(randomRowstoFetch, keys); + List rowsList = new ArrayList<>(rowsToFetch); + Collections.sort(rowsList); + hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf)); + List> result = hoodieHFileReader.readRecords(rowsList); + assertEquals(result.size(), randomRowstoFetch); + result.forEach(entry -> { + assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())); + }); + hoodieHFileReader.close(); + } + } + + private Set getRandomKeys(int count, List keys) { + Set rowKeys = new HashSet<>(); + int totalKeys = keys.size(); + while (rowKeys.size() < count) { + int index = RANDOM.nextInt(totalKeys); + if (!rowKeys.contains(index)) { + rowKeys.add(keys.get(index)); + } + } + return rowKeys; + } +} \ No newline at end of file diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieGraphiteMetrics.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieGraphiteMetrics.java new file mode 100644 index 0000000000000..6ff7ee88ac8fb --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieGraphiteMetrics.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metrics; + +import org.apache.hudi.common.testutils.NetworkTestUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import static org.apache.hudi.metrics.Metrics.registerGauge; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.when; + +/** + * Test for the Graphite metrics report. 
+ */ +@ExtendWith(MockitoExtension.class) +public class TestHoodieGraphiteMetrics { + + @Mock + HoodieWriteConfig config; + + @AfterEach + void shutdownMetrics() { + Metrics.shutdown(); + } + + @Test + public void testRegisterGauge() { + when(config.isMetricsOn()).thenReturn(true); + when(config.getTableName()).thenReturn("table1"); + when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.GRAPHITE); + when(config.getGraphiteServerHost()).thenReturn("localhost"); + when(config.getGraphiteServerPort()).thenReturn(NetworkTestUtils.nextFreePort()); + when(config.getGraphiteReportPeriodSeconds()).thenReturn(30); + new HoodieMetrics(config); + registerGauge("graphite_metric", 123L); + assertEquals("123", Metrics.getInstance().getRegistry().getGauges() + .get("graphite_metric").getValue().toString()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestHoodieMetricsDatadogConfig.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestHoodieMetricsDatadogConfig.java index 016e070d7fbce..aa486e9b95245 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestHoodieMetricsDatadogConfig.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestHoodieMetricsDatadogConfig.java @@ -19,7 +19,7 @@ package org.apache.hudi.metrics.datadog; -import org.apache.hudi.config.HoodieMetricsDatadogConfig; +import org.apache.hudi.config.metrics.HoodieMetricsDatadogConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.junit.jupiter.api.Test; diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java new file mode 100644 index 0000000000000..7dab6c2057c77 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestZOrderingUtil.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.optimize; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestZOrderingUtil { + + @Test + public void testIntConvert() { + // test Int + int[] testInt = new int[] {-1, 1, -2, 10000, -100000, 2, Integer.MAX_VALUE, Integer.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testInt.length; i++) { + valueWrappers.add(new OrginValueWrapper<>(i, testInt[i])); + convertResultWrappers.add(new ConvertResultWrapper<>(i, ZOrderingUtil.intTo8Byte(testInt[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testInt.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testLongConvert() { + // test Long + long[] testLong = new long[] {-1L, 1L, -2L, 10000L, -100000L, 2L, Long.MAX_VALUE, Long.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testLong.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((long)i, testLong[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((long)i, ZOrderingUtil.longTo8Byte(testLong[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testLong.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testDoubleConvert() { + // test Long + double[] testDouble = new double[] {-1.00d, 1.05d, -2.3d, 10000.002d, -100000.7d, 2.9d, Double.MAX_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testDouble.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((Double)(i * 1.0), testDouble[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((Double)(i * 1.0), ZOrderingUtil.doubleTo8Byte(testDouble[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testDouble.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testFloatConvert() { + // test Long + float[] testDouble = new float[] {-1.00f, 1.05f, -2.3f, 10000.002f, -100000.7f, 2.9f, Float.MAX_VALUE, Float.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testDouble.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((float)(i * 1.0), testDouble[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((float)(i * 1.0), ZOrderingUtil.doubleTo8Byte((double) testDouble[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> 
ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testDouble.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + private class ConvertResultWrapper { + T index; + byte[] result; + public ConvertResultWrapper(T index, byte[] result) { + this.index = index; + this.result = result; + } + } + + private class OrginValueWrapper { + T index; + T originValue; + public OrginValueWrapper(T index, T originValue) { + this.index = index; + this.originValue = originValue; + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java index eb2ebd95128e0..c0952bc5a7204 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java @@ -176,7 +176,7 @@ private void createCompactionMetadata(String instantTime) throws Exception { commitMetadata.setOperationType(WriteOperationType.COMPACT); commitMetadata.setCompacted(true); HoodieTestTable.of(metaClient) - .addCommit(instantTime, commitMetadata) + .addCommit(instantTime, Option.of(commitMetadata)) .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); } @@ -206,7 +206,7 @@ private void createCommitMetadata(String instantTime) throws Exception { commitMetadata.addMetadata("test", "test"); commitMetadata.setOperationType(WriteOperationType.INSERT); HoodieTestTable.of(metaClient) - .addCommit(instantTime, commitMetadata) + .addCommit(instantTime, Option.of(commitMetadata)) .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/FlinkTaskContextSupplier.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/FlinkTaskContextSupplier.java index aab248fc3cf16..10c5ac5d6b00c 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/FlinkTaskContextSupplier.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/FlinkTaskContextSupplier.java @@ -61,5 +61,5 @@ public Option getProperty(EngineProperty prop) { // no operation for now return Option.empty(); } - + } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java index fdefd9050ee46..3d44a2432f362 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java @@ -19,7 +19,7 @@ package org.apache.hudi.client; import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.client.embedded.EmbeddedTimelineService; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; @@ -36,32 +36,33 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import 
org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieCommitException; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.index.FlinkHoodieIndex; +import org.apache.hudi.index.FlinkHoodieIndexFactory; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.io.FlinkAppendHandle; +import org.apache.hudi.io.FlinkConcatAndReplaceHandle; +import org.apache.hudi.io.FlinkConcatHandle; import org.apache.hudi.io.FlinkCreateHandle; import org.apache.hudi.io.FlinkMergeAndReplaceHandle; import org.apache.hudi.io.FlinkMergeHandle; import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.io.MiniBatchHandle; import org.apache.hudi.metadata.FlinkHoodieBackedTableMetadataWriter; -import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieFlinkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.HoodieTimelineArchiveLog; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.compact.FlinkCompactHelpers; +import org.apache.hudi.table.action.compact.CompactHelpers; import org.apache.hudi.table.marker.WriteMarkersFactory; -import org.apache.hudi.table.upgrade.FlinkUpgradeDowngrade; +import org.apache.hudi.table.upgrade.FlinkUpgradeDowngradeHelper; +import org.apache.hudi.table.upgrade.UpgradeDowngrade; import org.apache.hudi.util.FlinkClientUtil; import com.codahale.metrics.Timer; @@ -71,7 +72,6 @@ import java.io.IOException; import java.text.ParseException; -import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -88,30 +88,24 @@ public class HoodieFlinkWriteClient extends * FileID to write handle mapping in order to record the write handles for each file group, * so that we can append the mini-batch data buffer incrementally. */ - private Map> bucketToHandles; + private final Map> bucketToHandles; - public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { - this(context, clientConfig, false); - } + /** + * Cached metadata writer for coordinator to reuse for each commit. + */ + private Option metadataWriterOption = Option.empty(); - @Deprecated - public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending) { + public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig) { super(context, writeConfig); this.bucketToHandles = new HashMap<>(); } - @Deprecated - public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending, - Option timelineService) { - super(context, writeConfig, timelineService); - } - /** * Complete changes performed at the given instantTime marker with specified action. 
*/ @Override - protected HoodieIndex>, List, List> createIndex(HoodieWriteConfig writeConfig) { - return FlinkHoodieIndex.createIndex((HoodieFlinkEngineContext) context, config); + protected HoodieIndex createIndex(HoodieWriteConfig writeConfig) { + return FlinkHoodieIndexFactory.createIndex((HoodieFlinkEngineContext) context, config); } @Override @@ -121,7 +115,8 @@ public boolean commit(String instantTime, List writeStatuses, Optio } @Override - protected HoodieTable>, List, List> createTable(HoodieWriteConfig config, Configuration hadoopConf) { + protected HoodieTable>, List, List> createTable(HoodieWriteConfig config, Configuration hadoopConf, + boolean refreshTimeline) { return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); } @@ -130,7 +125,8 @@ public List> filterExists(List> hoodieRecords) { // Create a Hoodie table which encapsulated the commits and files visible HoodieFlinkTable table = getHoodieTable(); Timer.Context indexTimer = metrics.getIndexCtx(); - List> recordsWithLocation = getIndex().tagLocation(hoodieRecords, context, table); + List> recordsWithLocation = HoodieList.getList( + getIndex().tagLocation(HoodieList.of(hoodieRecords), context, table)); metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop())); return recordsWithLocation.stream().filter(v1 -> !v1.isCurrentLocationKnown()).collect(Collectors.toList()); } @@ -261,6 +257,24 @@ protected void preWrite(String instantTime, WriteOperationType writeOperationTyp // remove the async cleaning } + @Override + protected void preCommit(HoodieInstant inflightInstant, HoodieCommitMetadata metadata) { + this.metadataWriterOption.ifPresent(w -> { + w.initTableMetadata(); // refresh the timeline + w.update(metadata, inflightInstant.getTimestamp(), getHoodieTable().isTableServiceAction(inflightInstant.getAction())); + }); + } + + /** + * Initialize the table metadata writer, for e.g, bootstrap the metadata table + * from the filesystem if it does not exist. + */ + public void initMetadataWriter() { + HoodieBackedTableMetadataWriter metadataWriter = (HoodieBackedTableMetadataWriter) FlinkHoodieBackedTableMetadataWriter.create( + FlinkClientUtil.getHadoopConf(), this.config, HoodieFlinkEngineContext.DEFAULT); + this.metadataWriterOption = Option.of(metadataWriter); + } + /** * Starts async cleaning service for finished commits. * @@ -334,8 +348,8 @@ public void commitCompaction( List writeStatuses, Option> extraMetadata) throws IOException { HoodieFlinkTable table = getHoodieTable(); - HoodieCommitMetadata metadata = FlinkCompactHelpers.newInstance().createCompactionMetadata( - table, compactionInstantTime, writeStatuses, config.getSchema()); + HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata( + table, compactionInstantTime, HoodieList.of(writeStatuses), config.getSchema()); extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata)); completeCompaction(metadata, writeStatuses, table, compactionInstantTime); } @@ -348,14 +362,16 @@ public void completeCompaction( String compactionCommitTime) { this.context.setJobStatus(this.getClass().getSimpleName(), "Collect compaction write status and commit compaction"); List writeStats = writeStatuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()); + writeTableMetadata(table, metadata, new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionCommitTime)); + // commit to data table after committing to metadata table. 
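The comment above captures the ordering this patch relies on: the metadata table is updated first, and only then is the commit finalized on the data table. A rough, hypothetical usage sketch of the coordinator-side wiring (writeRecords() is a placeholder for the actual Flink write path, and it is assumed that the base write client calls preCommit() while completing the instant):

    HoodieFlinkWriteClient<HoodieAvroPayload> client = new HoodieFlinkWriteClient<>(engineContext, writeConfig);
    client.initMetadataWriter();            // bootstrap the metadata table once, on the coordinator
    String instant = client.startCommit();  // open a new instant on the data table
    List<WriteStatus> statuses = writeRecords(client, instant);  // placeholder for the real write pipeline
    client.commit(instant, statuses);       // preCommit() pushes the commit metadata to the metadata
                                            // table before the data table commit completes
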
    finalizeWrite(table, compactionCommitTime, writeStats);
+   LOG.info("Committing Compaction {} finished with result {}.", compactionCommitTime, metadata);
-   FlinkCompactHelpers.newInstance().completeInflightCompaction(table, compactionCommitTime, metadata);
+   CompactHelpers.getInstance().completeInflightCompaction(table, compactionCommitTime, metadata);
    if (compactionTimer != null) {
      long durationInMs = metrics.getDurationInMs(compactionTimer.stop());
      try {
-       metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).getTime(),
+       metrics.updateCommitMetrics(HoodieActiveTimeline.parseInstantTime(compactionCommitTime).getTime(),
            durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION);
      } catch (ParseException e) {
        throw new HoodieCommitException("Commit time is not of valid format. Failed to commit compaction "
@@ -369,7 +385,8 @@ public void completeCompaction(
  protected List compact(String compactionInstantTime, boolean shouldComplete) {
    // only used for metadata table, the compaction happens in single thread
    try {
-     List writeStatuses = FlinkCompactHelpers.compact(compactionInstantTime, this);
+     List writeStatuses =
+         getHoodieTable().compact(context, compactionInstantTime).getWriteStatuses();
      commitCompaction(compactionInstantTime, writeStatuses, Option.empty());
      return writeStatuses;
    } catch (IOException e) {
@@ -382,23 +399,27 @@ public HoodieWriteMetadata> cluster(final String clusteringIns
    throw new HoodieNotSupportedException("Clustering is not supported yet");
  }
+ private void writeTableMetadata(HoodieTable>, List, List> table,
+                                 HoodieCommitMetadata commitMetadata,
+                                 HoodieInstant hoodieInstant) {
+   try {
+     this.txnManager.beginTransaction(Option.of(hoodieInstant), Option.empty());
+     // Do not do any conflict resolution here, as we do for regular writes. We take the lock here to ensure that all writes to the metadata table happen under a
+     // single lock (single writer), because more than one concurrent write to the metadata table would conflict, since they all update the same partition.
+     table.getMetadataWriter().ifPresent(w -> w.update(commitMetadata, hoodieInstant.getTimestamp(), table.isTableServiceAction(hoodieInstant.getAction())));
+   } finally {
+     this.txnManager.endTransaction();
+   }
+ }
+
  @Override
  protected HoodieTable>, List, List> getTableAndInitCtx(WriteOperationType operationType, String instantTime) {
    HoodieTableMetaClient metaClient = createMetaClient(true);
-   new FlinkUpgradeDowngrade(metaClient, config, context).run(metaClient, HoodieTableVersion.current(), config, context, instantTime);
+   new UpgradeDowngrade(metaClient, config, context, FlinkUpgradeDowngradeHelper.getInstance())
+       .run(HoodieTableVersion.current(), instantTime);
    return getTableAndInitCtx(metaClient, operationType);
  }
- @Override
- public void syncTableMetadata() {
-   // Open up the metadata table again, for syncing
-   try (HoodieTableMetadataWriter writer = FlinkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context)) {
-     LOG.info("Successfully synced to metadata table");
-   } catch (Exception e) {
-     throw new HoodieMetadataException("Error syncing to metadata table.", e);
-   }
- }
-
  /**
   * Upgrade downgrade the Hoodie table.
* @@ -407,7 +428,8 @@ public void syncTableMetadata() { */ public void upgradeDowngrade(String instantTime) { HoodieTableMetaClient metaClient = createMetaClient(true); - new FlinkUpgradeDowngrade(metaClient, config, context).run(metaClient, HoodieTableVersion.current(), config, context, instantTime); + new UpgradeDowngrade(metaClient, config, context, FlinkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.current(), instantTime); } /** @@ -448,13 +470,16 @@ public void cleanHandlesGracefully() { final HoodieRecordLocation loc = record.getCurrentLocation(); final String fileID = loc.getFileId(); final String partitionPath = record.getPartitionPath(); + final boolean insertClustering = config.allowDuplicateInserts(); if (bucketToHandles.containsKey(fileID)) { MiniBatchHandle lastHandle = (MiniBatchHandle) bucketToHandles.get(fileID); if (lastHandle.shouldReplace()) { - HoodieWriteHandle writeHandle = new FlinkMergeAndReplaceHandle<>( - config, instantTime, table, recordItr, partitionPath, fileID, table.getTaskContextSupplier(), - lastHandle.getWritePath()); + HoodieWriteHandle writeHandle = insertClustering + ? new FlinkConcatAndReplaceHandle<>(config, instantTime, table, recordItr, partitionPath, fileID, + table.getTaskContextSupplier(), lastHandle.getWritePath()) + : new FlinkMergeAndReplaceHandle<>(config, instantTime, table, recordItr, partitionPath, fileID, + table.getTaskContextSupplier(), lastHandle.getWritePath()); this.bucketToHandles.put(fileID, writeHandle); // override with new replace handle return writeHandle; } @@ -469,8 +494,11 @@ public void cleanHandlesGracefully() { writeHandle = new FlinkCreateHandle<>(config, instantTime, table, partitionPath, fileID, table.getTaskContextSupplier()); } else { - writeHandle = new FlinkMergeHandle<>(config, instantTime, table, recordItr, partitionPath, - fileID, table.getTaskContextSupplier()); + writeHandle = insertClustering + ? 
new FlinkConcatHandle<>(config, instantTime, table, recordItr, partitionPath, + fileID, table.getTaskContextSupplier()) + : new FlinkMergeHandle<>(config, instantTime, table, recordItr, partitionPath, + fileID, table.getTaskContextSupplier()); } this.bucketToHandles.put(fileID, writeHandle); return writeHandle; @@ -490,41 +518,6 @@ private HoodieTable>, List, List return table; } - public String getLastPendingInstant(HoodieTableType tableType) { - final String actionType = CommitUtils.getCommitActionType(tableType); - return getLastPendingInstant(actionType); - } - - public String getLastPendingInstant(String actionType) { - HoodieTimeline unCompletedTimeline = FlinkClientUtil.createMetaClient(basePath) - .getCommitsTimeline().filterInflightsAndRequested(); - return unCompletedTimeline.getInstants() - .filter(x -> x.getAction().equals(actionType) && x.isInflight()) - .map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()).stream() - .max(Comparator.naturalOrder()) - .orElse(null); - } - - public String getLastCompletedInstant(HoodieTableType tableType) { - final String commitType = CommitUtils.getCommitActionType(tableType); - HoodieTimeline completedTimeline = FlinkClientUtil.createMetaClient(basePath) - .getCommitsTimeline().filterCompletedInstants(); - return completedTimeline.getInstants() - .filter(x -> x.getAction().equals(commitType)) - .map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()).stream() - .max(Comparator.naturalOrder()) - .orElse(null); - } - - public void transitionRequestedToInflight(String commitType, String inFlightInstant) { - HoodieActiveTimeline activeTimeline = FlinkClientUtil.createMetaClient(basePath).getActiveTimeline(); - HoodieInstant requested = new HoodieInstant(HoodieInstant.State.REQUESTED, commitType, inFlightInstant); - activeTimeline.transitionRequestedToInflight(requested, Option.empty(), - config.shouldAllowMultiWriteOnSameInstant()); - } - public HoodieFlinkTable getHoodieTable() { return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/common/HoodieFlinkEngineContext.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/common/HoodieFlinkEngineContext.java index 2fc5af19bf7c0..62f8d4fa03ad2 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/common/HoodieFlinkEngineContext.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/common/HoodieFlinkEngineContext.java @@ -20,29 +20,41 @@ import org.apache.hudi.client.FlinkTaskContextSupplier; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieAtomicLongAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.EngineProperty; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.function.SerializableBiFunction; import org.apache.hudi.common.function.SerializableConsumer; import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; import org.apache.hudi.common.function.SerializablePairFunction; import org.apache.hudi.common.util.Option; import org.apache.flink.api.common.functions.RuntimeContext; +import java.util.Collections; +import java.util.Iterator; 
import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.util.FlinkClientUtil; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapToPairWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingReduceWrapper; /** * A flink engine implementation of HoodieEngineContext. @@ -65,6 +77,21 @@ public HoodieFlinkEngineContext(SerializableConfiguration hadoopConf, TaskContex this.runtimeContext = ((FlinkTaskContextSupplier) taskContextSupplier).getFlinkRuntimeContext(); } + @Override + public HoodieAccumulator newAccumulator() { + return HoodieAtomicLongAccumulator.create(); + } + + @Override + public HoodieData emptyHoodieData() { + return HoodieList.of(Collections.emptyList()); + } + + @Override + public HoodieData parallelize(List data, int parallelism) { + return HoodieList.of(data); + } + public RuntimeContext getRuntimeContext() { return this.runtimeContext; } @@ -74,6 +101,36 @@ public List map(List data, SerializableFunction func, int par return data.stream().parallel().map(throwingMapWrapper(func)).collect(Collectors.toList()); } + @Override + public List mapToPairAndReduceByKey(List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel().map(throwingMapToPairWrapper(mapToPairFunc)) + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + @Override + public Stream> mapPartitionsToPairAndReduceByKey( + Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism) { + return throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) + .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() + .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( + Pair::getValue).reduce(throwingReduceWrapper(reduceFunc)).orElse(null))) + .filter(Objects::nonNull); + } + + @Override + public List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel() + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(Collectors.toList()); diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/common/model/EventTimeAvroPayload.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/common/model/EventTimeAvroPayload.java new file mode 100644 index 
0000000000000..7c8efb66e5cb6 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/common/model/EventTimeAvroPayload.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; + +import java.io.IOException; +import java.util.Map; +import java.util.Properties; + +import static org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro; + +/** + * The only difference with {@link DefaultHoodieRecordPayload} is that is does not + * track the event time metadata for efficiency. + */ +public class EventTimeAvroPayload extends DefaultHoodieRecordPayload { + + public EventTimeAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public EventTimeAvroPayload(Option record) { + this(record.isPresent() ? record.get() : null, 0); // natural order + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) throws IOException { + if (recordBytes.length == 0) { + return Option.empty(); + } + + GenericRecord incomingRecord = bytesToAvro(recordBytes, schema); + + // Null check is needed here to support schema evolution. The record in storage may be from old schema where + // the new ordering column might not be present and hence returns null. + if (!needUpdatingPersistedRecord(currentValue, incomingRecord, properties)) { + return Option.of(currentValue); + } + + /* + * Now check if the incoming record is a delete record. + */ + return isDeleteRecord(incomingRecord) ? Option.empty() : Option.of(incomingRecord); + } + + @Override + public Option getInsertValue(Schema schema, Properties properties) throws IOException { + if (recordBytes.length == 0) { + return Option.empty(); + } + GenericRecord incomingRecord = bytesToAvro(recordBytes, schema); + + return isDeleteRecord(incomingRecord) ? 
Option.empty() : Option.of(incomingRecord); + } + + @Override + public Option> getMetadata() { + return Option.empty(); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/FlinkLazyInsertIterable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/FlinkLazyInsertIterable.java index 8769f63e37258..b0674b2a134d0 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/FlinkLazyInsertIterable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/FlinkLazyInsertIterable.java @@ -65,7 +65,7 @@ protected List computeNext() { try { final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema()); bufferedIteratorExecutor = - new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(inputItr), Option.of(getInsertHandler()), getTransformFunction(schema)); + new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(inputItr), Option.of(getInsertHandler()), getTransformFunction(schema, hoodieConfig)); final List result = bufferedIteratorExecutor.execute(); assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining(); return result; diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java index 272da8c6c8816..847a2183a156d 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java @@ -7,32 +7,29 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ package org.apache.hudi.index; import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.simple.FlinkHoodieSimpleIndex; -import org.apache.hudi.index.bloom.FlinkHoodieBloomIndex; -import org.apache.hudi.index.state.FlinkInMemoryStateIndex; -import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.table.HoodieTable; import java.util.List; @@ -46,38 +43,33 @@ protected FlinkHoodieIndex(HoodieWriteConfig config) { super(config); } - public static HoodieIndex createIndex(HoodieFlinkEngineContext context, HoodieWriteConfig config) { - // first use index class config to create index. - if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { - Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); - if (!(instance instanceof HoodieIndex)) { - throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); - } - return (FlinkHoodieIndex) instance; - } - - // TODO more indexes to be added - switch (config.getIndexType()) { - case INMEMORY: - return new FlinkInMemoryStateIndex<>(context, config); - case BLOOM: - return new FlinkHoodieBloomIndex(config); - case SIMPLE: - return new FlinkHoodieSimpleIndex<>(config); - default: - throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); - } - } - @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List updateLocation(List writeStatuses, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + HoodieEngineContext context, + HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List> tagLocation(List> records, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + HoodieEngineContext context, + HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieList.of(tagLocation(HoodieList.getList(records), context, hoodieTable)); + } + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieList.of(updateLocation(HoodieList.getList(writeStatuses), context, hoodieTable)); + } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java 
b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java new file mode 100644 index 0000000000000..a9196ca9a3d20 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.ListBasedHoodieBloomIndexHelper; +import org.apache.hudi.index.simple.HoodieSimpleIndex; +import org.apache.hudi.index.state.FlinkInMemoryStateIndex; + +/** + * A factory to generate Flink {@link HoodieIndex}. + */ +public final class FlinkHoodieIndexFactory { + public static HoodieIndex createIndex(HoodieFlinkEngineContext context, HoodieWriteConfig config) { + // first use index class config to create index. + if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { + Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); + if (!(instance instanceof HoodieIndex)) { + throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); + } + return (HoodieIndex) instance; + } + + // TODO more indexes to be added + switch (config.getIndexType()) { + case INMEMORY: + return new FlinkInMemoryStateIndex<>(context, config); + case BLOOM: + return new HoodieBloomIndex<>(config, ListBasedHoodieBloomIndexHelper.getInstance()); + case SIMPLE: + return new HoodieSimpleIndex<>(config, Option.empty()); + default: + throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); + } + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/simple/FlinkHoodieSimpleIndex.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/simple/FlinkHoodieSimpleIndex.java deleted file mode 100644 index 530bdbf163a7c..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/simple/FlinkHoodieSimpleIndex.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
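Aside, not part of the diff: a usage sketch for the new FlinkHoodieIndexFactory shown above. The builder calls are the standard HoodieWriteConfig/HoodieIndexConfig APIs and the table path is a placeholder; with a BLOOM index type the factory now returns the engine-agnostic HoodieBloomIndex (backed by the list-based helper) instead of the removed FlinkHoodieBloomIndex.

import org.apache.hudi.client.common.HoodieFlinkEngineContext;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.FlinkHoodieIndexFactory;
import org.apache.hudi.index.HoodieIndex;

public class FlinkIndexFactoryDemo {
  public static void main(String[] args) {
    // Placeholder table path; the index type selects one of the branches in the factory above.
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hoodie_demo_table")
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.BLOOM)
            .build())
        .build();

    HoodieIndex index = FlinkHoodieIndexFactory.createIndex(HoodieFlinkEngineContext.DEFAULT, writeConfig);
    System.out.println(index.getClass().getSimpleName());   // expected: HoodieBloomIndex
  }
}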
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.index.simple; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.FlinkHoodieIndex; -import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.io.HoodieKeyLocationFetchHandle; -import org.apache.hudi.table.HoodieTable; - -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; - -/** - * A simple index which reads interested fields(record key and partition path) from base files and - * compares with incoming records to find the tagged location. - * - * @param type of payload - */ -public class FlinkHoodieSimpleIndex extends FlinkHoodieIndex { - - public FlinkHoodieSimpleIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public List updateLocation(List writeStatuses, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException { - return writeStatuses; - } - - @Override - public boolean rollbackCommit(String instantTime) { - return true; - } - - @Override - public boolean isGlobal() { - return false; - } - - @Override - public boolean canIndexLogFiles() { - return false; - } - - @Override - public boolean isImplicitWithStorage() { - return true; - } - - @Override - public List> tagLocation(List> hoodieRecords, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException { - return tagLocationInternal(hoodieRecords, context, hoodieTable); - } - - /** - * Tags records location for incoming records. - */ - private List> tagLocationInternal(List> hoodieRecords, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) { - Map> keyedInputRecords = context.mapToPair(hoodieRecords, record -> Pair.of(record.getKey(), record), 0); - Map existingLocationsOnTable = fetchRecordLocationsForAffectedPartitions(keyedInputRecords.keySet(), context, hoodieTable, config.getSimpleIndexParallelism()); - List> taggedRecords = new LinkedList<>(); - - for (Map.Entry> hoodieKeyHoodieRecordEntry : keyedInputRecords.entrySet()) { - HoodieKey key = hoodieKeyHoodieRecordEntry.getKey(); - HoodieRecord record = hoodieKeyHoodieRecordEntry.getValue(); - if (existingLocationsOnTable.containsKey(key)) { - taggedRecords.add(HoodieIndexUtils.getTaggedRecord(record, Option.ofNullable(existingLocationsOnTable.get(key)))); - } - } - return taggedRecords; - } - - /** - * Fetch record locations for passed in {@link HoodieKey}s. 
- * - * @param keySet {@link HoodieKey}s for which locations are fetched - * @param context instance of {@link HoodieEngineContext} to use - * @param hoodieTable instance of {@link HoodieTable} of interest - * @param parallelism parallelism to use - * @return {@link Map} of {@link HoodieKey} and {@link HoodieRecordLocation} - */ - private Map fetchRecordLocationsForAffectedPartitions(Set keySet, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable, - int parallelism) { - List affectedPartitionPathList = keySet.stream().map(HoodieKey::getPartitionPath).distinct().collect(Collectors.toList()); - List> latestBaseFiles = getLatestBaseFilesForAllPartitions(affectedPartitionPathList, context, hoodieTable); - return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles); - } - - private Map fetchRecordLocations(HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable, - int parallelism, - List> latestBaseFiles) { - - List>, List, List>> hoodieKeyLocationFetchHandles = - context.map(latestBaseFiles, partitionPathBaseFile -> new HoodieKeyLocationFetchHandle<>(config, hoodieTable, partitionPathBaseFile, Option.empty()), parallelism); - Map recordLocations = new HashMap<>(); - hoodieKeyLocationFetchHandles.stream() - .flatMap(handle -> handle.locations()) - .forEach(x -> recordLocations.put(x.getKey(), x.getRight())); - return recordLocations; - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java index 118a0e729b90d..aa779c4252fcd 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java @@ -20,13 +20,14 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.FlinkHoodieIndex; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; import org.apache.log4j.LogManager; @@ -39,7 +40,8 @@ * * @param type of payload */ -public class FlinkInMemoryStateIndex extends FlinkHoodieIndex { +public class FlinkInMemoryStateIndex> + extends HoodieIndex>, List, List> { private static final Logger LOG = LogManager.getLogger(FlinkInMemoryStateIndex.class); @@ -48,16 +50,16 @@ public FlinkInMemoryStateIndex(HoodieFlinkEngineContext context, HoodieWriteConf } @Override - public List> tagLocation(List> records, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException { + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { throw new UnsupportedOperationException("No need to tag location for FlinkInMemoryStateIndex"); } @Override - public List updateLocation(List writeStatuses, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException { + public HoodieData updateLocation( + HoodieData writeStatuses, 
HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { throw new UnsupportedOperationException("No need to update location for FlinkInMemoryStateIndex"); } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkAppendHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkAppendHandle.java index 9cb6337003cea..1872637aeefde 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkAppendHandle.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkAppendHandle.java @@ -88,11 +88,6 @@ protected boolean isUpdateRecord(HoodieRecord hoodieRecord) { && hoodieRecord.getCurrentLocation().getInstantTime().equals("U"); } - @Override - public boolean canWrite(HoodieRecord record) { - return true; - } - @Override public List close() { try { diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatAndReplaceHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatAndReplaceHandle.java new file mode 100644 index 0000000000000..300e8c512bb34 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatAndReplaceHandle.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; + +/** + * A {@link FlinkMergeAndReplaceHandle} that supports CONCAT write incrementally(small data buffers). + * + *

The records iterator for super constructor is reset as empty thus the initialization for new records + * does nothing. This handle keep the iterator for itself to override the write behavior. + */ +public class FlinkConcatAndReplaceHandle + extends FlinkMergeAndReplaceHandle { + private static final Logger LOG = LoggerFactory.getLogger(FlinkConcatAndReplaceHandle.class); + + // a representation of incoming records that tolerates duplicate keys + private final Iterator> recordItr; + + public FlinkConcatAndReplaceHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier, Path basePath) { + super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier, basePath); + this.recordItr = recordItr; + } + + /** + * Write old record as is w/o merging with incoming record. + */ + @Override + public void write(GenericRecord oldRecord) { + String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt); + try { + fileWriter.writeAvro(key, oldRecord); + } catch (IOException | RuntimeException e) { + String errMsg = String.format("Failed to write old record into new file for key %s from old file %s to new file %s with writerSchema %s", + key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true)); + LOG.debug("Old record is " + oldRecord); + throw new HoodieUpsertException(errMsg, e); + } + recordsWritten++; + } + + @Override + protected void writeIncomingRecords() throws IOException { + while (recordItr.hasNext()) { + HoodieRecord record = recordItr.next(); + writeInsertRecord(record); + } + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatHandle.java new file mode 100644 index 0000000000000..812155c3d2fb0 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatHandle.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
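Aside, not part of the diff: the concat handles (FlinkConcatAndReplaceHandle above and FlinkConcatHandle below) copy old records over as-is and append the incoming records without merging, which is the behavior wanted when duplicate keys are allowed on insert. A hedged sketch of the config switch commonly paired with this path; the exact wiring from the option to the chosen handle lives in the Flink writer and is not part of this patch, so treat it as an assumption.

import java.util.Collections;

import org.apache.hudi.config.HoodieWriteConfig;

public class AllowDuplicateInsertsSketch {
  public static void main(String[] args) {
    // "hoodie.merge.allow.duplicate.on.inserts" requests concat-style writes instead of merging.
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hoodie_demo_table")
        .withProps(Collections.singletonMap("hoodie.merge.allow.duplicate.on.inserts", "true"))
        .build();
    System.out.println(config.getProps().getProperty("hoodie.merge.allow.duplicate.on.inserts"));
  }
}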
+ */ + +package org.apache.hudi.io; + +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.generic.GenericRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; + +/** + * Handle to concatenate new records to old records w/o any merging. + * + *

The records iterator for super constructor is reset as empty thus the initialization for new records + * does nothing. This handle keep the iterator for itself to override the write behavior. + */ +public class FlinkConcatHandle + extends FlinkMergeHandle { + private static final Logger LOG = LoggerFactory.getLogger(FlinkConcatHandle.class); + + // a representation of incoming records that tolerates duplicate keys + private final Iterator> recordItr; + + public FlinkConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier) { + super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier); + this.recordItr = recordItr; + } + + /** + * Write old record as is w/o merging with incoming record. + */ + @Override + public void write(GenericRecord oldRecord) { + String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt); + try { + fileWriter.writeAvro(key, oldRecord); + } catch (IOException | RuntimeException e) { + String errMsg = String.format("Failed to write old record into new file for key %s from old file %s to new file %s with writerSchema %s", + key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true)); + LOG.debug("Old record is " + oldRecord); + throw new HoodieUpsertException(errMsg, e); + } + recordsWritten++; + } + + @Override + protected void writeIncomingRecords() throws IOException { + while (recordItr.hasNext()) { + HoodieRecord record = recordItr.next(); + writeInsertRecord(record); + } + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetSchemaConverter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetSchemaConverter.java index 80fda29aa4756..5187660c8caec 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetSchemaConverter.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetSchemaConverter.java @@ -564,7 +564,7 @@ private static Type convertToParquetType( int scale = ((DecimalType) type).getScale(); int numBytes = computeMinBytesForDecimalPrecision(precision); return Types.primitive( - PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, repetition) + PrimitiveType.PrimitiveTypeName.BINARY, repetition) .precision(precision) .scale(scale) .length(numBytes) diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java index 458af1a402f4c..8254d0b884616 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java @@ -18,27 +18,20 @@ package org.apache.hudi.metadata; +import org.apache.avro.specific.SpecificRecordBase; import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.metrics.Registry; import org.apache.hudi.common.model.FileSlice; -import 
org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieMetadataException; -import org.apache.hudi.table.HoodieFlinkTable; -import org.apache.hudi.table.HoodieTable; import org.apache.hadoop.conf.Configuration; import org.apache.log4j.LogManager; @@ -47,19 +40,29 @@ import java.io.IOException; import java.util.Collections; import java.util.List; -import java.util.Map; import java.util.stream.Collectors; public class FlinkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter { private static final Logger LOG = LogManager.getLogger(FlinkHoodieBackedTableMetadataWriter.class); - public static HoodieTableMetadataWriter create(Configuration conf, HoodieWriteConfig writeConfig, HoodieEngineContext context) { - return new FlinkHoodieBackedTableMetadataWriter(conf, writeConfig, context); + public static HoodieTableMetadataWriter create(Configuration conf, HoodieWriteConfig writeConfig, + HoodieEngineContext context) { + return create(conf, writeConfig, context, Option.empty()); } - FlinkHoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteConfig writeConfig, HoodieEngineContext engineContext) { - super(hadoopConf, writeConfig, engineContext); + public static HoodieTableMetadataWriter create(Configuration conf, + HoodieWriteConfig writeConfig, + HoodieEngineContext context, + Option actionMetadata) { + return new FlinkHoodieBackedTableMetadataWriter(conf, writeConfig, context, actionMetadata); + } + + FlinkHoodieBackedTableMetadataWriter(Configuration hadoopConf, + HoodieWriteConfig writeConfig, + HoodieEngineContext engineContext, + Option actionMetadata) { + super(hadoopConf, writeConfig, engineContext, actionMetadata); } @Override @@ -74,10 +77,11 @@ protected void initRegistry() { } @Override - protected void initialize(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient) { + protected void initialize(HoodieEngineContext engineContext, + Option actionMetadata) { try { if (enabled) { - bootstrapIfNeeded(engineContext, datasetMetaClient); + bootstrapIfNeeded(engineContext, dataMetaClient, actionMetadata); } } catch (IOException e) { LOG.error("Failed to initialize metadata table. 
Disabling the writer.", e); @@ -86,88 +90,65 @@ protected void initialize(HoodieEngineContext engineContext, HoodieTableMetaClie } @Override - protected void commit(List records, String partitionName, String instantTime) { + protected void commit(List records, String partitionName, String instantTime, boolean canTriggerTableService) { ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled"); - List recordRDD = prepRecords(records, partitionName); + List recordList = prepRecords(records, partitionName, 1); - try (HoodieFlinkWriteClient writeClient = new HoodieFlinkWriteClient(engineContext, metadataWriteConfig, true)) { - writeClient.startCommitWithTime(instantTime); - writeClient.transitionRequestedToInflight(HoodieActiveTimeline.DELTA_COMMIT_ACTION, instantTime); + try (HoodieFlinkWriteClient writeClient = new HoodieFlinkWriteClient(engineContext, metadataWriteConfig)) { + if (!metadataMetaClient.getActiveTimeline().filterCompletedInstants().containsInstant(instantTime)) { + // if this is a new commit being applied to metadata for the first time + writeClient.startCommitWithTime(instantTime); + metadataMetaClient.getActiveTimeline().transitionRequestedToInflight(HoodieActiveTimeline.DELTA_COMMIT_ACTION, instantTime); + } else { + // this code path refers to a re-attempted commit that got committed to metadata table, but failed in datatable. + // for eg, lets say compaction c1 on 1st attempt succeeded in metadata table and failed before committing to datatable. + // when retried again, data table will first rollback pending compaction. these will be applied to metadata table, but all changes + // are upserts to metadata table and so only a new delta commit will be created. + // once rollback is complete, compaction will be retried again, which will eventually hit this code block where the respective commit is + // already part of completed commit. So, we have to manually remove the completed instant and proceed. + // and it is for the same reason we enabled withAllowMultiWriteOnSameInstant for metadata table. + HoodieInstant alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get(); + HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant); + metadataMetaClient.reloadActiveTimeline(); + } - List statuses = writeClient.upsertPreppedRecords(recordRDD, instantTime); + List statuses = records.size() > 0 + ? writeClient.upsertPreppedRecords(recordList, instantTime) + : Collections.emptyList(); statuses.forEach(writeStatus -> { if (writeStatus.hasErrors()) { throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime); } }); + // flink does not support auto-commit yet, also the auto commit logic is not complete as AbstractHoodieWriteClient now. writeClient.commit(instantTime, statuses, Option.empty(), HoodieActiveTimeline.DELTA_COMMIT_ACTION, Collections.emptyMap()); - // trigger cleaning, compaction, with suffixes based on the same instant time. This ensures that any future - // delta commits synced over will not have an instant time lesser than the last completed instant on the - // metadata table. 
- if (writeClient.scheduleCompactionAtInstant(instantTime + "001", Option.empty())) { - writeClient.compact(instantTime + "001"); + + // reload timeline + metadataMetaClient.reloadActiveTimeline(); + if (canTriggerTableService) { + compactIfNecessary(writeClient, instantTime); + doClean(writeClient, instantTime); } - writeClient.clean(instantTime + "002"); } // Update total size of the metadata and count of base/log files - metrics.ifPresent(m -> { - try { - Map stats = m.getStats(false, metaClient, metadata); - m.updateMetrics(Long.parseLong(stats.get(HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE)), - Long.parseLong(stats.get(HoodieMetadataMetrics.STAT_TOTAL_LOG_FILE_SIZE)), - Integer.parseInt(stats.get(HoodieMetadataMetrics.STAT_COUNT_BASE_FILES)), - Integer.parseInt(stats.get(HoodieMetadataMetrics.STAT_COUNT_LOG_FILES))); - } catch (HoodieIOException e) { - LOG.error("Could not publish metadata size metrics", e); - } - }); + metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata)); } /** - * Tag each record with the location. - *

- * Since we only read the latest base file in a partition, we tag the records with the instant time of the latest - * base file. + * Tag each record with the location in the given partition. + * + * The record is tagged with respective file slice's location based on its record key. */ - private List prepRecords(List records, String partitionName) { - HoodieTable table = HoodieFlinkTable.create(metadataWriteConfig, (HoodieFlinkEngineContext) engineContext); - TableFileSystemView.SliceView fsView = table.getSliceView(); - List baseFiles = fsView.getLatestFileSlices(partitionName) - .map(FileSlice::getBaseFile) - .filter(Option::isPresent) - .map(Option::get) - .collect(Collectors.toList()); - - // All the metadata fits within a single base file - if (partitionName.equals(MetadataPartitionType.FILES.partitionPath())) { - if (baseFiles.size() > 1) { - throw new HoodieMetadataException("Multiple base files found in metadata partition"); - } - } - - String fileId; - String instantTime; - if (!baseFiles.isEmpty()) { - fileId = baseFiles.get(0).getFileId(); - instantTime = "U"; - } else { - // If there is a log file then we can assume that it has the data - List logFiles = fsView.getLatestFileSlices(MetadataPartitionType.FILES.partitionPath()) - .map(FileSlice::getLatestLogFile) - .filter(Option::isPresent) - .map(Option::get) - .collect(Collectors.toList()); - if (logFiles.isEmpty()) { - // No base and log files. All are new inserts - fileId = FSUtils.createNewFileIdPfx(); - instantTime = "I"; - } else { - fileId = logFiles.get(0).getFileId(); - instantTime = "U"; - } - } - - return records.stream().map(r -> r.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId))).collect(Collectors.toList()); + private List prepRecords(List records, String partitionName, int numFileGroups) { + List fileSlices = HoodieTableMetadataUtil.loadPartitionFileGroupsWithLatestFileSlices(metadataMetaClient, partitionName); + ValidationUtils.checkArgument(fileSlices.size() == numFileGroups, String.format("Invalid number of file groups: found=%d, required=%d", fileSlices.size(), numFileGroups)); + + return records.stream().map(r -> { + FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), numFileGroups)); + final String instantTime = slice.isEmpty() ? 
"I" : "U"; + r.setCurrentLocation(new HoodieRecordLocation(instantTime, slice.getFileId())); + return r; + }).collect(Collectors.toList()); } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java index 8a9b4bf9b8206..85ad1364de9ca 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java @@ -24,6 +24,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -31,8 +32,10 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieNotSupportedException; @@ -43,8 +46,8 @@ import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; -import org.apache.hudi.table.action.clean.FlinkCleanActionExecutor; -import org.apache.hudi.table.action.clean.FlinkScheduleCleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; import org.apache.hudi.table.action.commit.FlinkDeleteCommitActionExecutor; import org.apache.hudi.table.action.commit.FlinkInsertCommitActionExecutor; import org.apache.hudi.table.action.commit.FlinkInsertOverwriteCommitActionExecutor; @@ -53,7 +56,8 @@ import org.apache.hudi.table.action.commit.FlinkMergeHelper; import org.apache.hudi.table.action.commit.FlinkUpsertCommitActionExecutor; import org.apache.hudi.table.action.commit.FlinkUpsertPreppedCommitActionExecutor; -import org.apache.hudi.table.action.rollback.FlinkCopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -72,7 +76,8 @@ *

* UPDATES - Produce a new version of the file, just replacing the updated records with new values */ -public class HoodieFlinkCopyOnWriteTable extends HoodieFlinkTable { +public class HoodieFlinkCopyOnWriteTable + extends HoodieFlinkTable implements HoodieCompactionHandler { private static final Logger LOG = LoggerFactory.getLogger(HoodieFlinkCopyOnWriteTable.class); @@ -80,6 +85,11 @@ public HoodieFlinkCopyOnWriteTable(HoodieWriteConfig config, HoodieEngineContext super(config, context, metaClient); } + @Override + public boolean isTableServiceAction(String actionType) { + return !actionType.equals(HoodieTimeline.COMMIT_ACTION); + } + /** * Upsert a batch of new records into Hoodie table at the supplied instantTime. * @@ -229,6 +239,11 @@ public HoodieWriteMetadata deletePartitions(HoodieEngineContext context, String throw new HoodieNotSupportedException("DeletePartitions is not supported yet"); } + @Override + public void updateStatistics(HoodieEngineContext context, List stats, String instantTime, Boolean isOptimizeOperation) { + throw new HoodieNotSupportedException("update statistics is not supported yet"); + } + @Override public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, String instantTime, List> preppedRecords) { throw new HoodieNotSupportedException("This method should not be invoked"); @@ -263,7 +278,8 @@ public Option scheduleCompaction(HoodieEngineContext conte } @Override - public HoodieWriteMetadata> compact(HoodieEngineContext context, String compactionInstantTime) { + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); } @@ -295,17 +311,24 @@ public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { */ @Override public Option scheduleCleaning(HoodieEngineContext context, String instantTime, Option> extraMetadata) { - return new FlinkScheduleCleanActionExecutor(context, config, this, instantTime, extraMetadata).execute(); + return new CleanPlanActionExecutor(context, config, this, instantTime, extraMetadata).execute(); } @Override - public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime) { - return new FlinkCleanActionExecutor(context, config, this, cleanInstantTime).execute(); + public Option scheduleRollback(HoodieEngineContext context, String instantTime, HoodieInstant instantToRollback, + boolean skipTimelinePublish) { + return new BaseRollbackPlanActionExecutor(context, config, this, instantTime, instantToRollback, skipTimelinePublish).execute(); } @Override - public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, boolean deleteInstants) { - return new FlinkCopyOnWriteRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute(); + public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime, boolean skipLocking) { + return new CleanActionExecutor(context, config, this, cleanInstantTime).execute(); + } + + @Override + public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, + boolean deleteInstants, boolean skipLocking) { + return new CopyOnWriteRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants, skipLocking).execute(); } @Override @@ -321,9 +344,10 @@ public HoodieRestoreMetadata restore(HoodieEngineContext 
context, String restore // ------------------------------------------------------------------------- // Used for compaction // ------------------------------------------------------------------------- - - public Iterator> handleUpdate(String instantTime, String partitionPath, String fileId, - Map> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException { + @Override + public Iterator> handleUpdate( + String instantTime, String partitionPath, String fileId, + Map> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException { // these are updates HoodieMergeHandle upsertHandle = getUpdateHandle(instantTime, partitionPath, fileId, keyToNewRecords, oldDataFile); return handleUpdateInternal(upsertHandle, instantTime, fileId); @@ -358,9 +382,11 @@ protected HoodieMergeHandle getUpdateHandle(String instantTime, String partition } } - public Iterator> handleInsert(String instantTime, String partitionPath, String fileId, - Map> recordMap) { - HoodieCreateHandle createHandle = + @Override + public Iterator> handleInsert( + String instantTime, String partitionPath, String fileId, + Map> recordMap) { + HoodieCreateHandle createHandle = new HoodieCreateHandle(config, instantTime, this, partitionPath, fileId, recordMap, taskContextSupplier); createHandle.write(); return Collections.singletonList(createHandle.close()).iterator(); diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkMergeOnReadTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkMergeOnReadTable.java index bfe8b6f4999cd..5ad87e0831e97 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkMergeOnReadTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkMergeOnReadTable.java @@ -20,24 +20,27 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.io.FlinkAppendHandle; import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.delta.FlinkUpsertDeltaCommitActionExecutor; import org.apache.hudi.table.action.commit.delta.FlinkUpsertPreppedDeltaCommitActionExecutor; -import org.apache.hudi.table.action.compact.BaseScheduleCompactionActionExecutor; -import org.apache.hudi.table.action.compact.FlinkScheduleCompactionActionExecutor; -import org.apache.hudi.table.action.rollback.FlinkMergeOnReadRollbackActionExecutor; +import org.apache.hudi.table.action.compact.HoodieFlinkMergeOnReadTableCompactor; +import org.apache.hudi.table.action.compact.RunCompactionActionExecutor; +import org.apache.hudi.table.action.compact.ScheduleCompactionActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import 
org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor; import java.util.List; import java.util.Map; @@ -52,6 +55,11 @@ public class HoodieFlinkMergeOnReadTable super(config, context, metaClient); } + @Override + public boolean isTableServiceAction(String actionType) { + return !actionType.equals(HoodieTimeline.DELTA_COMMIT_ACTION); + } + @Override public HoodieWriteMetadata> upsert( HoodieEngineContext context, @@ -95,20 +103,32 @@ public Option scheduleCompaction( HoodieEngineContext context, String instantTime, Option> extraMetadata) { - BaseScheduleCompactionActionExecutor scheduleCompactionExecutor = new FlinkScheduleCompactionActionExecutor( - context, config, this, instantTime, extraMetadata); + ScheduleCompactionActionExecutor scheduleCompactionExecutor = new ScheduleCompactionActionExecutor( + context, config, this, instantTime, extraMetadata, + new HoodieFlinkMergeOnReadTableCompactor()); return scheduleCompactionExecutor.execute(); } @Override - public HoodieWriteMetadata> compact(HoodieEngineContext context, String compactionInstantTime) { - throw new HoodieNotSupportedException("Compaction is supported as a separate pipeline, " - + "should not invoke directly through HoodieFlinkMergeOnReadTable"); + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { + RunCompactionActionExecutor compactionExecutor = new RunCompactionActionExecutor( + context, config, this, compactionInstantTime, new HoodieFlinkMergeOnReadTableCompactor(), + new HoodieFlinkCopyOnWriteTable(config, context, getMetaClient())); + return convertMetadata(compactionExecutor.execute()); + } + + @Override + public Option scheduleRollback(HoodieEngineContext context, String instantTime, HoodieInstant instantToRollback, + boolean skipTimelinePublish) { + return new BaseRollbackPlanActionExecutor(context, config, this, instantTime, instantToRollback, skipTimelinePublish).execute(); } @Override - public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, boolean deleteInstants) { - return new FlinkMergeOnReadRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute(); + public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, + boolean deleteInstants, boolean skipLocking) { + return new MergeOnReadRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants, + skipLocking).execute(); } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java index 3e26025c258bb..f00781f8fa695 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java @@ -18,8 +18,10 @@ package org.apache.hudi.table; +import org.apache.avro.specific.SpecificRecordBase; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -29,14 +31,28 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import 
org.apache.hudi.exception.HoodieException; -import org.apache.hudi.index.FlinkHoodieIndex; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.index.FlinkHoodieIndexFactory; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.metadata.FlinkHoodieBackedTableMetadataWriter; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; import java.util.List; +import static org.apache.hudi.common.data.HoodieList.getList; + public abstract class HoodieFlinkTable extends HoodieTable>, List, List> implements ExplicitWriteHandleTable { + + private boolean isMetadataAvailabilityUpdated = false; + private boolean isMetadataTableAvailable; + protected HoodieFlinkTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { super(config, context, metaClient); } @@ -52,18 +68,64 @@ public static HoodieFlinkTable create(HoodieW public static HoodieFlinkTable create(HoodieWriteConfig config, HoodieFlinkEngineContext context, HoodieTableMetaClient metaClient) { + return HoodieFlinkTable.create(config, context, metaClient, config.isMetadataTableEnabled()); + } + + public static HoodieFlinkTable create(HoodieWriteConfig config, + HoodieFlinkEngineContext context, + HoodieTableMetaClient metaClient, + boolean refreshTimeline) { + final HoodieFlinkTable hoodieFlinkTable; switch (metaClient.getTableType()) { case COPY_ON_WRITE: - return new HoodieFlinkCopyOnWriteTable<>(config, context, metaClient); + hoodieFlinkTable = new HoodieFlinkCopyOnWriteTable<>(config, context, metaClient); + break; case MERGE_ON_READ: - return new HoodieFlinkMergeOnReadTable<>(config, context, metaClient); + hoodieFlinkTable = new HoodieFlinkMergeOnReadTable<>(config, context, metaClient); + break; default: throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); } + if (refreshTimeline) { + hoodieFlinkTable.getHoodieView().sync(); + } + return hoodieFlinkTable; + } + + public static HoodieWriteMetadata> convertMetadata( + HoodieWriteMetadata> metadata) { + return metadata.clone(getList(metadata.getWriteStatuses())); + } + + @Override + protected HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context) { + return FlinkHoodieIndexFactory.createIndex((HoodieFlinkEngineContext) context, config); } + /** + * Fetch instance of {@link HoodieTableMetadataWriter}. + * + * @return instance of {@link HoodieTableMetadataWriter} + */ @Override - protected HoodieIndex>, List, List> getIndex(HoodieWriteConfig config, HoodieEngineContext context) { - return FlinkHoodieIndex.createIndex((HoodieFlinkEngineContext) context, config); + public Option getMetadataWriter(Option actionMetadata) { + synchronized (this) { + if (!isMetadataAvailabilityUpdated) { + // This code assumes that if metadata availability is updated once it will not change. + // Please revisit this logic if that's not the case. This is done to avoid repeated calls to fs.exists(). 
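Aside, not part of the diff: the List/HoodieData bridging pattern used throughout this file (for example in convertMetadata above) and in the index and helper classes earlier in the patch. A minimal round-trip sketch using the two static helpers that appear in the hunks:

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.data.HoodieList;

public class HoodieListBridgeSketch {
  public static void main(String[] args) {
    // Wrap a plain Java list before handing it to the engine-agnostic code paths ...
    List<String> input = Arrays.asList("a", "b", "c");
    HoodieData<String> data = HoodieList.of(input);
    // ... and unwrap it again on the Flink side, which still works with java.util.List.
    List<String> output = HoodieList.getList(data);
    System.out.println(output);
  }
}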
+ try { + isMetadataTableAvailable = config.isMetadataTableEnabled() + && metaClient.getFs().exists(new Path(HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath()))); + } catch (IOException e) { + throw new HoodieMetadataException("Checking existence of metadata table failed", e); + } + isMetadataAvailabilityUpdated = true; + } + } + if (isMetadataTableAvailable) { + return Option.of(FlinkHoodieBackedTableMetadataWriter.create(context.getHadoopConf().get(), config, context)); + } else { + return Option.empty(); + } } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkCleanActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkCleanActionExecutor.java deleted file mode 100644 index 9378cb2304b79..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkCleanActionExecutor.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
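Aside, not part of the diff: the Flink-specific clean executors deleted here are replaced by the engine-agnostic CleanPlanActionExecutor and CleanActionExecutor, as wired up in the HoodieFlinkCopyOnWriteTable hunk above. A condensed sketch of that schedule-then-execute sequence, assuming only the constructor shapes visible in that hunk:

import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.clean.CleanActionExecutor;
import org.apache.hudi.table.action.clean.CleanPlanActionExecutor;

public class CleanViaCommonExecutorsSketch {

  // First schedule the clean (persists a cleaner plan if any files are eligible), then execute it.
  @SuppressWarnings({"rawtypes", "unchecked"})
  static HoodieCleanMetadata scheduleAndClean(HoodieEngineContext context, HoodieWriteConfig config,
                                              HoodieTable table, String cleanInstantTime) {
    new CleanPlanActionExecutor(context, config, table, cleanInstantTime, Option.empty()).execute();
    return new CleanActionExecutor(context, config, table, cleanInstantTime).execute();
  }
}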
- */ - -package org.apache.hudi.table.action.clean; - -import org.apache.hudi.avro.model.HoodieActionInstant; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.HoodieCleanStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.CleanFileInfo; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import scala.Tuple2; - -public class FlinkCleanActionExecutor extends - BaseCleanActionExecutor>, List, List> { - - private static final Logger LOG = LogManager.getLogger(FlinkCleanActionExecutor.class); - - public FlinkCleanActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime) { - super(context, config, table, instantTime); - } - - @Override - List clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) { - Stream> filesToBeDeletedPerPartition = cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream() - .flatMap(x -> x.getValue().stream().map(y -> new Tuple2<>(x.getKey(), new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile())))); - - Stream> partitionCleanStats = - deleteFilesFunc(filesToBeDeletedPerPartition, table) - .collect(Collectors.groupingBy(Pair::getLeft)) - .entrySet().stream() - .map(x -> new Tuple2(x.getKey(), x.getValue().stream().map(y -> y.getRight()).reduce(PartitionCleanStat::merge).get())); - - Map partitionCleanStatsMap = partitionCleanStats - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); - - // Return PartitionCleanStat for each partition passed. - return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> { - PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath) - ? partitionCleanStatsMap.get(partitionPath) - : new PartitionCleanStat(partitionPath); - HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain(); - return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath) - .withEarliestCommitRetained(Option.ofNullable( - actionInstant != null - ? 
new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), - actionInstant.getAction(), actionInstant.getTimestamp()) - : null)) - .withDeletePathPattern(partitionCleanStat.deletePathPatterns()) - .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles()) - .withFailedDeletes(partitionCleanStat.failedDeleteFiles()) - .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns()) - .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles()) - .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles()) - .build(); - }).collect(Collectors.toList()); - } - - private static Stream> deleteFilesFunc(Stream> cleanFileInfo, HoodieTable table) { - Map partitionCleanStatMap = new HashMap<>(); - FileSystem fs = table.getMetaClient().getFs(); - - cleanFileInfo.parallel().forEach(partitionDelFileTuple -> { - String partitionPath = partitionDelFileTuple._1(); - Path deletePath = new Path(partitionDelFileTuple._2().getFilePath()); - String deletePathStr = deletePath.toString(); - Boolean deletedFileResult = null; - try { - deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); - } catch (IOException e) { - LOG.error("Delete file failed"); - } - final PartitionCleanStat partitionCleanStat; - synchronized (partitionCleanStatMap) { - partitionCleanStat = partitionCleanStatMap.computeIfAbsent(partitionPath, k -> new PartitionCleanStat(partitionPath)); - } - boolean isBootstrapBasePathFile = partitionDelFileTuple._2().isBootstrapBaseFile(); - if (isBootstrapBasePathFile) { - // For Bootstrap Base file deletions, store the full file path. - partitionCleanStat.addDeleteFilePatterns(deletePath.toString(), true); - partitionCleanStat.addDeletedFileResult(deletePath.toString(), deletedFileResult, true); - } else { - partitionCleanStat.addDeleteFilePatterns(deletePath.getName(), false); - partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult, false); - } - }); - return partitionCleanStatMap.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkScheduleCleanActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkScheduleCleanActionExecutor.java deleted file mode 100644 index 75da54e31cf9d..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkScheduleCleanActionExecutor.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.clean; - -import java.util.List; -import java.util.Map; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -public class FlinkScheduleCleanActionExecutor extends - BaseCleanPlanActionExecutor>, List, List> { - - private static final Logger LOG = LogManager.getLogger(FlinkScheduleCleanActionExecutor.class); - - public FlinkScheduleCleanActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - Option> extraMetadata) { - super(context, config, table, instantTime, extraMetadata); - } - - @Override - protected Option createCleanerPlan() { - return super.execute(); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java index 5cfd28be2c1dc..5dfa511a8823f 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java @@ -141,13 +141,14 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta result.setWriteStats(writeStats); // Finalize write finalizeWrite(instantTime, writeStats, result); - syncTableMetadata(); try { LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType()); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); + writeTableMetadata(metadata, actionType); + activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); LOG.info("Committed " + instantTime); diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java index 185482642ead3..05ac93725bfc9 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; import org.apache.hudi.common.model.HoodieKey; @@ -95,8 +96,8 @@ public HoodieWriteMetadata> execute(String instantTime, dedupedKeys.stream().map(key -> new HoodieRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); Instant beginTag = Instant.now(); // perform index look up to get 
existing location of records - List> taggedRecords = - table.getIndex().tagLocation(dedupedRecords, context, table); + List> taggedRecords = HoodieList.getList( + table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table)); Duration tagLocationDuration = Duration.between(beginTag, Instant.now()); // filter out non existent keys/records diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java index 5cb1b80a5136e..0863ad8e48e09 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; @@ -80,9 +81,14 @@ public HoodieWriteMetadata> write(String instantTime, List> deduplicateRecords(List> records, - HoodieIndex>, List, List> index, - int parallelism) { + protected List> tag(List> dedupedRecords, HoodieEngineContext context, HoodieTable>, List, List> table) { + return HoodieList.getList( + table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table)); + } + + @Override + public List> deduplicateRecords( + List> records, HoodieIndex index, int parallelism) { Map>>> keyedRecords = records.stream().map(record -> { // If index used is global, then records are expected to differ in their partitionPath final Object key = record.getKey().getRecordKey(); diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/FlinkCompactHelpers.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/FlinkCompactHelpers.java deleted file mode 100644 index 68a42a5578743..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/FlinkCompactHelpers.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
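Note on the two index hunks above (FlinkDeleteHelper and FlinkWriteHelper): the Flink client now goes through the engine-agnostic HoodieData abstraction, wrapping the in-memory record list with HoodieList.of(...) before calling tagLocation and unwrapping the result with HoodieList.getList(...). A minimal sketch of that wrap/unwrap round trip follows; the helper class name and the generic type parameters are reconstructed by assumption, since the diff rendering above drops angle-bracketed generics.

    import java.util.List;

    import org.apache.hudi.common.data.HoodieData;
    import org.apache.hudi.common.data.HoodieList;
    import org.apache.hudi.common.engine.HoodieEngineContext;
    import org.apache.hudi.common.model.HoodieRecord;
    import org.apache.hudi.common.model.HoodieRecordPayload;
    import org.apache.hudi.table.HoodieTable;

    public class TagLocationSketch { // illustrative helper, not part of the patch

      // Wrap the plain Java list, let the index tag record locations, unwrap the result.
      @SuppressWarnings({"unchecked", "rawtypes"})
      public static <T extends HoodieRecordPayload> List<HoodieRecord<T>> tagWithIndex(
          List<HoodieRecord<T>> records, HoodieEngineContext context, HoodieTable table) {
        HoodieData<HoodieRecord<T>> tagged = (HoodieData<HoodieRecord<T>>)
            table.getIndex().tagLocation(HoodieList.of(records), context, table);
        return HoodieList.getList(tagged);
      }
    }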
- */ - -package org.apache.hudi.table.action.compact; - -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.HoodieFlinkWriteClient; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.CompactionOperation; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; -import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.table.HoodieFlinkCopyOnWriteTable; -import org.apache.hudi.table.HoodieFlinkTable; -import org.apache.hudi.table.HoodieTable; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import static java.util.stream.Collectors.toList; - -/** - * A flink implementation of {@link AbstractCompactHelpers}. - * - * @param - */ -public class FlinkCompactHelpers extends - AbstractCompactHelpers>, List, List> { - private static final Logger LOG = LoggerFactory.getLogger(FlinkCompactHelpers.class); - - private FlinkCompactHelpers() { - } - - private static class CompactHelperHolder { - private static final FlinkCompactHelpers FLINK_COMPACT_HELPERS = new FlinkCompactHelpers(); - } - - public static FlinkCompactHelpers newInstance() { - return CompactHelperHolder.FLINK_COMPACT_HELPERS; - } - - @Override - public HoodieCommitMetadata createCompactionMetadata(HoodieTable>, List, List> table, - String compactionInstantTime, - List writeStatuses, - String schema) throws IOException { - byte[] planBytes = table.getActiveTimeline().readCompactionPlanAsBytes( - HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get(); - HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan(planBytes); - List updateStatusMap = writeStatuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()); - org.apache.hudi.common.model.HoodieCommitMetadata metadata = new org.apache.hudi.common.model.HoodieCommitMetadata(true); - for (HoodieWriteStat stat : updateStatusMap) { - metadata.addWriteStat(stat.getPartitionPath(), stat); - } - metadata.addMetadata(org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY, schema); - if (compactionPlan.getExtraMetadata() != null) { - compactionPlan.getExtraMetadata().forEach(metadata::addMetadata); - } - return metadata; - } - - @SuppressWarnings("unchecked, rawtypes") - public static List compact( - HoodieFlinkWriteClient writeClient, - String compactInstantTime, - CompactionOperation compactionOperation) throws IOException { - HoodieFlinkMergeOnReadTableCompactor compactor = new HoodieFlinkMergeOnReadTableCompactor(); - return compactor.compact( - new HoodieFlinkCopyOnWriteTable<>( - writeClient.getConfig(), - writeClient.getEngineContext(), - writeClient.getHoodieTable().getMetaClient()), - writeClient.getHoodieTable().getMetaClient(), - writeClient.getConfig(), - compactionOperation, - compactInstantTime); - } - - /** - * Called by the metadata table compactor code path. 
- */ - @SuppressWarnings("unchecked, rawtypes") - public static List compact(String compactionInstantTime, HoodieFlinkWriteClient writeClient) throws IOException { - HoodieFlinkTable table = writeClient.getHoodieTable(); - HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline(); - HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime); - if (pendingCompactionTimeline.containsInstant(inflightInstant)) { - writeClient.rollbackInflightCompaction(inflightInstant, table); - table.getMetaClient().reloadActiveTimeline(); - } - - // generate compaction plan - // should support configurable commit metadata - HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan( - table.getMetaClient(), compactionInstantTime); - - if (compactionPlan == null || (compactionPlan.getOperations() == null) - || (compactionPlan.getOperations().isEmpty())) { - // do nothing. - LOG.info("No compaction plan for instant " + compactionInstantTime); - return Collections.emptyList(); - } else { - HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); - // Mark instant as compaction inflight - table.getActiveTimeline().transitionCompactionRequestedToInflight(instant); - table.getMetaClient().reloadActiveTimeline(); - - List operations = compactionPlan.getOperations().stream() - .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList()); - LOG.info("Compacting " + operations + " files"); - List writeStatusList = new ArrayList<>(); - for (CompactionOperation operation : operations) { - List statuses = compact(writeClient, compactionInstantTime, operation); - writeStatusList.addAll(statuses); - } - return writeStatusList; - } - } -} - diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/HoodieFlinkMergeOnReadTableCompactor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/HoodieFlinkMergeOnReadTableCompactor.java index 1f4a5248411f0..03b9f8e7ee090 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/HoodieFlinkMergeOnReadTableCompactor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/HoodieFlinkMergeOnReadTableCompactor.java @@ -18,54 +18,17 @@ package org.apache.hudi.table.action.compact; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.model.HoodieCompactionOperation; -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.FlinkTaskContextSupplier; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.CompactionOperation; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.timeline.HoodieInstant; import 
org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; -import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.io.IOUtils; -import org.apache.hudi.table.HoodieFlinkCopyOnWriteTable; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; import java.util.List; -import java.util.Set; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; - -import static java.util.stream.Collectors.toList; /** * Compacts a hoodie table with merge on read storage. Computes all possible compactions, @@ -75,164 +38,21 @@ *

Note: the compaction logic is invoked through the flink pipeline. */ @SuppressWarnings("checkstyle:LineLength") -public class HoodieFlinkMergeOnReadTableCompactor implements HoodieCompactor>, List, List> { - - private static final Logger LOG = LogManager.getLogger(HoodieFlinkMergeOnReadTableCompactor.class); - - // Accumulator to keep track of total log files for a table - private AtomicLong totalLogFiles; - // Accumulator to keep track of total log file slices for a table - private AtomicLong totalFileSlices; +public class HoodieFlinkMergeOnReadTableCompactor + extends HoodieCompactor>, List, List> { @Override - public List compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan, - HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException { - throw new UnsupportedOperationException("HoodieFlinkMergeOnReadTableCompactor does not support compact directly, " - + "the function works as a separate pipeline"); - } - - public List compact(HoodieFlinkCopyOnWriteTable hoodieCopyOnWriteTable, - HoodieTableMetaClient metaClient, - HoodieWriteConfig config, - CompactionOperation operation, - String instantTime) throws IOException { - FileSystem fs = metaClient.getFs(); - - Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField()); - LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames() - + " for commit " + instantTime); - // TODO - FIX THIS - // Reads the entire avro file. Always only specific blocks should be read from the avro file - // (failure recover). - // Load all the delta commits since the last compaction commit and get all the blocks to be - // loaded and load it using CompositeAvroLogReader - // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. 
- String maxInstantTime = metaClient - .getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, - HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION)) - .filterCompletedInstants().lastInstant().get().getTimestamp(); - // TODO(danny): make it configurable - long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new FlinkTaskContextSupplier(null), config); - LOG.info("MaxMemoryPerCompaction => " + maxMemoryPerCompaction); - - List logFiles = operation.getDeltaFileNames().stream().map( - p -> new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), operation.getPartitionPath()), p).toString()) - .collect(toList()); - HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() - .withFileSystem(fs) - .withBasePath(metaClient.getBasePath()) - .withLogFilePaths(logFiles) - .withReaderSchema(readerSchema) - .withLatestInstantTime(maxInstantTime) - .withMaxMemorySizeInBytes(maxMemoryPerCompaction) - .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled()) - .withReverseReader(config.getCompactionReverseLogReadEnabled()) - .withBufferSize(config.getMaxDFSStreamBufferSize()) - .withSpillableMapBasePath(config.getSpillableMapBasePath()) - .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType()) - .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) - .build(); - if (!scanner.iterator().hasNext()) { - return new ArrayList<>(); - } - - Option oldDataFileOpt = - operation.getBaseFile(metaClient.getBasePath(), operation.getPartitionPath()); - - // Compacting is very similar to applying updates to existing file - Iterator> result; - // If the dataFile is present, perform updates else perform inserts into a new base file. 
- if (oldDataFileOpt.isPresent()) { - result = hoodieCopyOnWriteTable.handleUpdate(instantTime, operation.getPartitionPath(), - operation.getFileId(), scanner.getRecords(), - oldDataFileOpt.get()); - } else { - result = hoodieCopyOnWriteTable.handleInsert(instantTime, operation.getPartitionPath(), operation.getFileId(), - scanner.getRecords()); + public void preCompact( + HoodieTable table, HoodieTimeline pendingCompactionTimeline, String compactionInstantTime) { + HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime); + if (pendingCompactionTimeline.containsInstant(inflightInstant)) { + table.rollbackInflightCompaction(inflightInstant); + table.getMetaClient().reloadActiveTimeline(); } - Iterable> resultIterable = () -> result; - return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> { - s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog()); - s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles()); - s.getStat().setTotalLogRecords(scanner.getTotalLogRecords()); - s.getStat().setPartitionPath(operation.getPartitionPath()); - s.getStat() - .setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue()); - s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks()); - s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks()); - s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks()); - RuntimeStats runtimeStats = new RuntimeStats(); - runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks()); - s.getStat().setRuntimeStats(runtimeStats); - scanner.close(); - }).collect(toList()); } @Override - public HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable, - HoodieWriteConfig config, String compactionCommitTime, - Set fgIdsInPendingCompactionAndClustering) - throws IOException { - totalLogFiles = new AtomicLong(0); - totalFileSlices = new AtomicLong(0); - - ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, - "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not " - + hoodieTable.getMetaClient().getTableType().name()); - - // TODO : check if maxMemory is not greater than JVM or flink.executor memory - // TODO - rollback any compactions in flight - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime); - List partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath()); - - // filter the partition paths if needed to reduce list status - partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths); - - if (partitionPaths.isEmpty()) { - // In case no partitions could be picked, return no compaction plan - return null; - } - - SliceView fileSystemView = hoodieTable.getSliceView(); - LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); - context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact"); - - List operations = context.flatMap(partitionPaths, partitionPath -> fileSystemView - .getLatestFileSlices(partitionPath) - .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId())) - .map(s -> { - List logFiles = - 
s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); - totalLogFiles.addAndGet(logFiles.size()); - totalFileSlices.addAndGet(1L); - // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO - // for flink Map operations and collecting them finally in Avro generated classes for storing - // into meta files. - Option dataFile = s.getBaseFile(); - return new CompactionOperation(dataFile, partitionPath, logFiles, - config.getCompactionStrategy().captureMetrics(config, s)); - }) - .filter(c -> !c.getDeltaFileNames().isEmpty()), partitionPaths.size()).stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList()); - - LOG.info("Total of " + operations.size() + " compactions are retrieved"); - LOG.info("Total number of latest files slices " + totalFileSlices.get()); - LOG.info("Total number of log files " + totalLogFiles.get()); - LOG.info("Total number of file slices " + totalFileSlices.get()); - // Filter the compactions with the passed in filter. This lets us choose most effective - // compactions only - HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations, - CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList())); - ValidationUtils.checkArgument( - compactionPlan.getOperations().stream().noneMatch( - op -> fgIdsInPendingCompactionAndClustering.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), - "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. " - + "Please fix your strategy implementation. FileIdsWithPendingCompactions :" + fgIdsInPendingCompactionAndClustering - + ", Selected workload :" + compactionPlan); - if (compactionPlan.getOperations().isEmpty()) { - LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath()); - } - return compactionPlan; + public void maybePersist(HoodieData writeStatus, HoodieWriteConfig config) { + // No OP } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkCopyOnWriteRollbackActionExecutor.java deleted file mode 100644 index 47039a3adf20a..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkCopyOnWriteRollbackActionExecutor.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
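The replacement compactor above keeps only two engine hooks: preCompact rolls back an inflight compaction and reloads the active timeline, while maybePersist is left as a no-op, presumably because the Flink client's HoodieData is a plain in-memory list with nothing to cache. A tiny round-trip sketch of that wrapper, with illustrative class name and values:

    import java.util.Arrays;
    import java.util.List;

    import org.apache.hudi.common.data.HoodieData;
    import org.apache.hudi.common.data.HoodieList;

    public class HoodieListRoundTrip { // illustrative only
      public static void main(String[] args) {
        List<String> statuses = Arrays.asList("write-status-1", "write-status-2");
        // HoodieList simply wraps the list; wrapping and unwrapping yields the same
        // elements, so a persist/cache step would be redundant for a list-based engine.
        HoodieData<String> wrapped = HoodieList.of(statuses);
        List<String> unwrapped = HoodieList.getList(wrapped);
        System.out.println(unwrapped); // [write-status-1, write-status-2]
      }
    }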
- */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class FlinkCopyOnWriteRollbackActionExecutor extends - BaseCopyOnWriteRollbackActionExecutor>, List, List> { - public FlinkCopyOnWriteRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public FlinkCopyOnWriteRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() { - if (useMarkerBasedStrategy) { - return new FlinkMarkerBasedRollbackStrategy(table, context, config, instantTime); - } else { - return this::executeRollbackUsingFileListing; - } - } - - @Override - protected List executeRollbackUsingFileListing(HoodieInstant instantToRollback) { - List rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW( - context, table.getMetaClient().getBasePath(), config); - return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMarkerBasedRollbackStrategy.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMarkerBasedRollbackStrategy.java deleted file mode 100644 index bb7ec7600a21b..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMarkerBasedRollbackStrategy.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.marker.MarkerBasedRollbackUtils; -import org.apache.hudi.table.marker.WriteMarkers; - -import org.apache.hadoop.fs.FileStatus; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import scala.Tuple2; - -@SuppressWarnings("checkstyle:LineLength") -public class FlinkMarkerBasedRollbackStrategy extends AbstractMarkerBasedRollbackStrategy>, List, List> { - public FlinkMarkerBasedRollbackStrategy(HoodieTable>, List, List> table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) { - super(table, context, config, instantTime); - } - - @Override - public List execute(HoodieInstant instantToRollback) { - try { - List markerPaths = MarkerBasedRollbackUtils.getAllMarkerPaths( - table, context, instantToRollback.getTimestamp(), config.getRollbackParallelism()); - List rollbackStats = context.map(markerPaths, markerFilePath -> { - String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1); - IOType type = IOType.valueOf(typeStr); - switch (type) { - case MERGE: - return undoMerge(WriteMarkers.stripMarkerSuffix(markerFilePath)); - case APPEND: - return undoAppend(WriteMarkers.stripMarkerSuffix(markerFilePath), instantToRollback); - case CREATE: - return undoCreate(WriteMarkers.stripMarkerSuffix(markerFilePath)); - default: - throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback); - } - }, 0); - - return rollbackStats.stream().map(rollbackStat -> new Tuple2<>(rollbackStat.getPartitionPath(), rollbackStat)) - .collect(Collectors.groupingBy(Tuple2::_1)) - .values() - .stream() - .map(x -> x.stream().map(y -> y._2).reduce(RollbackUtils::mergeRollbackStat).get()) - .collect(Collectors.toList()); - } catch (Exception e) { - throw new HoodieRollbackException("Error rolling back using marker files written for " + instantToRollback, e); - } - } - - protected Map getWrittenLogFileSizeMap(String partitionPathStr, String baseCommitTime, String fileId) throws IOException { - // collect all log files that is supposed to be deleted with this rollback - return FSUtils.getAllLogFiles(table.getMetaClient().getFs(), - FSUtils.getPartitionPath(config.getBasePath(), partitionPathStr), fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), baseCommitTime) - .collect(Collectors.toMap(HoodieLogFile::getFileStatus, value -> value.getFileStatus().getLen())); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMergeOnReadRollbackActionExecutor.java deleted file mode 100644 index 25b20a5073ffd..0000000000000 
--- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMergeOnReadRollbackActionExecutor.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.table.HoodieTable; - -import java.io.IOException; -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class FlinkMergeOnReadRollbackActionExecutor extends - BaseMergeOnReadRollbackActionExecutor>, List, List> { - public FlinkMergeOnReadRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public FlinkMergeOnReadRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected RollbackStrategy getRollbackStrategy() { - if (useMarkerBasedStrategy) { - return new FlinkMarkerBasedRollbackStrategy(table, context, config, instantTime); - } else { - return this::executeRollbackUsingFileListing; - } - } - - @Override - protected List executeRollbackUsingFileListing(HoodieInstant resolvedInstant) { - List rollbackRequests; - try { - rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(resolvedInstant, table, context); - } catch (IOException e) { - throw new HoodieIOException("Error generating rollback requests by file listing.", e); - } - return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, resolvedInstant, rollbackRequests); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java deleted file mode 100644 index f03b211bf9de8..0000000000000 --- 
a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock; -import org.apache.hudi.common.table.log.block.HoodieLogBlock; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.collection.ImmutablePair; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieRollbackException; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.PathFilter; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; - -/** - * Performs Rollback of Hoodie Tables. - */ -public class ListingBasedRollbackHelper implements Serializable { - - private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackHelper.class); - - private final HoodieTableMetaClient metaClient; - private final HoodieWriteConfig config; - - public ListingBasedRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { - this.metaClient = metaClient; - this.config = config; - } - - /** - * Performs all rollback actions that we have collected in parallel. - */ - public List performRollback(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - Map partitionPathRollbackStatsPairs = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, true); - - Map>> collect = partitionPathRollbackStatsPairs.entrySet() - .stream() - .map(x -> Pair.of(x.getKey(), x.getValue())).collect(Collectors.groupingBy(Pair::getLeft)); - return collect.values().stream() - .map(pairs -> pairs.stream().map(Pair::getRight).reduce(RollbackUtils::mergeRollbackStat).orElse(null)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } - - /** - * Collect all file info that needs to be rollbacked. 
- */ - public List collectRollbackStats(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - Map partitionPathRollbackStatsPairs = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, false); - return new ArrayList<>(partitionPathRollbackStatsPairs.values()); - } - - /** - * May be delete interested files and collect stats or collect stats only. - * - * @param context instance of {@link HoodieEngineContext} to use. - * @param instantToRollback {@link HoodieInstant} of interest for which deletion or collect stats is requested. - * @param rollbackRequests List of {@link ListingBasedRollbackRequest} to be operated on. - * @param doDelete {@code true} if deletion has to be done. {@code false} if only stats are to be collected w/o performing any deletes. - * @return stats collected with or w/o actual deletions. - */ - Map maybeDeleteAndCollectStats(HoodieEngineContext context, - HoodieInstant instantToRollback, - List rollbackRequests, - boolean doDelete) { - return context.mapToPair(rollbackRequests, rollbackRequest -> { - switch (rollbackRequest.getType()) { - case DELETE_DATA_FILES_ONLY: { - final Map filesToDeletedStatus = deleteBaseFiles(metaClient, config, instantToRollback.getTimestamp(), - rollbackRequest.getPartitionPath(), doDelete); - return new ImmutablePair<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case DELETE_DATA_AND_LOG_FILES: { - final Map filesToDeletedStatus = deleteBaseAndLogFiles(metaClient, config, instantToRollback.getTimestamp(), rollbackRequest.getPartitionPath(), doDelete); - return new ImmutablePair<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case APPEND_ROLLBACK_BLOCK: { - String fileId = rollbackRequest.getFileId().get(); - String latestBaseInstant = rollbackRequest.getLatestBaseInstant().get(); - - // collect all log files that is supposed to be deleted with this rollback - Map writtenLogFileSizeMap = FSUtils.getAllLogFiles(metaClient.getFs(), - FSUtils.getPartitionPath(config.getBasePath(), rollbackRequest.getPartitionPath()), - fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), latestBaseInstant) - .collect(Collectors.toMap(HoodieLogFile::getFileStatus, value -> value.getFileStatus().getLen())); - - HoodieLogFormat.Writer writer = null; - try { - writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath())) - .withFileId(fileId) - .overBaseCommit(latestBaseInstant) - .withFs(metaClient.getFs()) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - - // generate metadata - if (doDelete) { - Map header = generateHeader(instantToRollback.getTimestamp()); - // if update belongs to an existing log file - writer.appendBlock(new HoodieCommandBlock(header)); - } - } catch (IOException | InterruptedException io) { - throw new HoodieRollbackException("Failed to rollback for instant " + instantToRollback, io); - } finally { - try { - if (writer != null) { - writer.close(); - } - } catch (IOException io) { - throw new HoodieIOException("Error appending rollback block..", io); - } - } - - // This step is intentionally done after writer is closed. 
Guarantees that - // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in - // cloud-storage : HUDI-168 - Map filesToNumBlocksRollback = Collections.singletonMap( - metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()), - 1L - ); - return new ImmutablePair<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withRollbackBlockAppendResults(filesToNumBlocksRollback) - .withWrittenLogFileSizeMap(writtenLogFileSizeMap).build()); - } - default: - throw new IllegalStateException("Unknown Rollback action " + rollbackRequest); - } - }, 0); - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. - */ - private Map deleteBaseAndLogFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - LOG.info("Cleaning path " + partitionPath); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - SerializablePathFilter filter = (path) -> { - if (path.toString().endsWith(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } else if (FSUtils.isLogFile(path)) { - // Since the baseCommitTime is the only commit for new log files, it's okay here - String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); - return commit.equals(fileCommitTime); - } - return false; - }; - - final Map results = new HashMap<>(); - FileSystem fs = metaClient.getFs(); - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. 
- */ - private Map deleteBaseFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - final Map results = new HashMap<>(); - LOG.info("Cleaning path " + partitionPath); - FileSystem fs = metaClient.getFs(); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - PathFilter filter = (path) -> { - if (path.toString().contains(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } - return false; - }; - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - private Map generateHeader(String commit) { - // generate metadata - Map header = new HashMap<>(3); - header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); - header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, commit); - header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, - String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); - return header; - } - - public interface SerializablePathFilter extends PathFilter, Serializable { - - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngrade.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngrade.java deleted file mode 100644 index 67376aef587af..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngrade.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.common.config.ConfigProperty; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.HoodieTableVersion; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieUpgradeDowngradeException; - -import java.io.IOException; -import java.util.Map; - -public class FlinkUpgradeDowngrade extends AbstractUpgradeDowngrade { - public FlinkUpgradeDowngrade(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context) { - super(metaClient, config, context); - } - - @Override - public void run(HoodieTableMetaClient metaClient, HoodieTableVersion toVersion, HoodieWriteConfig config, - HoodieEngineContext context, String instantTime) { - try { - new FlinkUpgradeDowngrade(metaClient, config, context).run(toVersion, instantTime); - } catch (IOException e) { - throw new HoodieUpgradeDowngradeException("Error during upgrade/downgrade to version:" + toVersion, e); - } - } - - @Override - protected Map upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { - if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) { - return new ZeroToOneUpgradeHandler().upgrade(config, context, instantTime); - } else if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.TWO) { - return new OneToTwoUpgradeHandler().upgrade(config, context, instantTime); - } else { - throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), true); - } - } - - @Override - protected Map downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { - if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.ZERO) { - return new OneToZeroDowngradeHandler().downgrade(config, context, instantTime); - } else if (fromVersion == HoodieTableVersion.TWO && toVersion == HoodieTableVersion.ONE) { - return new TwoToOneDowngradeHandler().downgrade(config, context, instantTime); - } else { - throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), false); - } - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java new file mode 100644 index 0000000000000..d097d2e60057c --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.table.HoodieTable; + +/** + * Flink upgrade and downgrade helper. + */ +public class FlinkUpgradeDowngradeHelper implements BaseUpgradeDowngradeHelper { + + private static final FlinkUpgradeDowngradeHelper SINGLETON_INSTANCE = + new FlinkUpgradeDowngradeHelper(); + + private FlinkUpgradeDowngradeHelper() { + } + + public static FlinkUpgradeDowngradeHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { + return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); + } + + @Override + public String getPartitionColumns(HoodieWriteConfig config) { + return config.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java deleted file mode 100644 index ec8098aa66866..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieFlinkTable; -import org.apache.hudi.table.HoodieTable; - -public class TwoToOneDowngradeHandler extends BaseTwoToOneDowngradeHandler { - @Override - HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { - return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java deleted file mode 100644 index 59e94e557e063..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
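The new FlinkUpgradeDowngradeHelper above replaces the per-version Flink handlers with a stateless singleton that resolves the Flink table and reads the configured partition-path field. A small usage sketch follows; the wrapper class, base path, and config-builder calls around it are illustrative assumptions, while getInstance() and getPartitionColumns() come from the patch.

    import java.util.Collections;

    import org.apache.hudi.config.HoodieWriteConfig;
    import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
    import org.apache.hudi.table.upgrade.FlinkUpgradeDowngradeHelper;

    public class UpgradeHelperUsage { // illustrative only
      public static void main(String[] args) {
        // Build a write config carrying the partition-path property the helper reads back.
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
            .withPath("/tmp/hoodie_table") // placeholder base path
            .withProps(Collections.singletonMap(
                KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partition"))
            .build();

        // Stateless singleton: getPartitionColumns reads the property straight from the config.
        String partitionFields =
            FlinkUpgradeDowngradeHelper.getInstance().getPartitionColumns(config);
        System.out.println(partitionFields); // partition
      }
    }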
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieFlinkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.rollback.ListingBasedRollbackHelper; -import org.apache.hudi.table.action.rollback.ListingBasedRollbackRequest; - -import java.util.List; - -/** - * Upgrade handle to assist in upgrading hoodie table from version 0 to 1. - */ -public class ZeroToOneUpgradeHandler extends BaseZeroToOneUpgradeHandler { - - @Override - HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { - return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); - } - - @Override - List getListBasedRollBackStats(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context, Option commitInstantOpt, - List rollbackRequests) { - return new ListingBasedRollbackHelper(metaClient, config) - .collectRollbackStats(context, commitInstantOpt.get(), rollbackRequests); - } -} diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java index 91c5cbd26dec3..7b4e3b675ea05 100644 --- a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java @@ -21,6 +21,8 @@ import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.data.HoodieList; +import org.apache.hudi.common.data.HoodieMapPair; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -100,8 +102,8 @@ private HoodieWriteConfig makeConfig(boolean rangePruning, boolean treeFiltering @MethodSource("configParams") public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); - FlinkHoodieBloomIndex index = new FlinkHoodieBloomIndex(config); - HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient); + HoodieBloomIndex index = new HoodieBloomIndex(config, 
ListBasedHoodieBloomIndexHelper.getInstance()); + HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient, false); HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA); // Create some partitions, and put some files @@ -165,7 +167,7 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b @MethodSource("configParams") public void testRangePruning(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) { HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); - FlinkHoodieBloomIndex index = new FlinkHoodieBloomIndex(config); + HoodieBloomIndex index = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); final Map> partitionToFileIndexInfo = new HashMap<>(); partitionToFileIndexInfo.put("2017/10/22", @@ -176,14 +178,14 @@ public void testRangePruning(boolean rangePruning, boolean treeFiltering, boolea Map> partitionRecordKeyMap = new HashMap<>(); asList(Pair.of("2017/10/22", "003"), Pair.of("2017/10/22", "002"), Pair.of("2017/10/22", "005"), Pair.of("2017/10/22", "004")) - .forEach(t -> { - List recordKeyList = partitionRecordKeyMap.getOrDefault(t.getLeft(), new ArrayList<>()); - recordKeyList.add(t.getRight()); - partitionRecordKeyMap.put(t.getLeft(), recordKeyList); - }); + .forEach(t -> { + List recordKeyList = partitionRecordKeyMap.getOrDefault(t.getLeft(), new ArrayList<>()); + recordKeyList.add(t.getRight()); + partitionRecordKeyMap.put(t.getLeft(), recordKeyList); + }); - List> comparisonKeyList = - index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyMap); + List> comparisonKeyList = HoodieList.getList( + index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, HoodieMapPair.of(partitionRecordKeyMap))); assertEquals(10, comparisonKeyList.size()); java.util.Map> recordKeyToFileComps = comparisonKeyList.stream() @@ -264,10 +266,10 @@ public void testTagLocationWithEmptyList(boolean rangePruning, boolean treeFilte HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient); // Let's tag - FlinkHoodieBloomIndex bloomIndex = new FlinkHoodieBloomIndex(config); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); assertDoesNotThrow(() -> { - bloomIndex.tagLocation(records, context, table); + tagLocation(bloomIndex, records, table); }, "EmptyList should not result in IllegalArgumentException: Positive number of slices required"); } @@ -303,8 +305,8 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA); // Let's tag - FlinkHoodieBloomIndex bloomIndex = new FlinkHoodieBloomIndex(config); - List taggedRecords = bloomIndex.tagLocation(records, context, hoodieTable); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + List taggedRecords = tagLocation(bloomIndex, records, hoodieTable); // Should not find any files for (HoodieRecord record : taggedRecords) { @@ -319,7 +321,7 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean metaClient.reloadActiveTimeline(); // We do the tag again - taggedRecords = bloomIndex.tagLocation(records, context, HoodieFlinkTable.create(config, context, metaClient)); + taggedRecords = tagLocation(bloomIndex, records, HoodieFlinkTable.create(config, context, metaClient)); // Check 
results for (HoodieRecord record : taggedRecords) { @@ -370,10 +372,10 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA); // Let's tag - FlinkHoodieBloomIndex bloomIndex = new FlinkHoodieBloomIndex(config); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); List toTagRecords = new ArrayList<>(); toTagRecords.add(new HoodieRecord(record4.getKey(), null)); - List taggedRecords = bloomIndex.tagLocation(toTagRecords, context, hoodieTable); + List taggedRecords = tagLocation(bloomIndex, toTagRecords, hoodieTable); Map>> recordLocations = new HashMap<>(); for (HoodieRecord taggedRecord : taggedRecords) { recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown() @@ -398,7 +400,7 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean taggedRecords.add(new HoodieRecord(key, null)); } - taggedRecords = bloomIndex.tagLocation(toTagRecords1, context, hoodieTable); + taggedRecords = tagLocation(bloomIndex, toTagRecords1, hoodieTable); recordLocations.clear(); for (HoodieRecord taggedRecord : taggedRecords) { recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown() @@ -452,8 +454,8 @@ public void testBloomFilterFalseError(boolean rangePruning, boolean treeFilterin metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieFlinkTable.create(config, context, metaClient); - FlinkHoodieBloomIndex bloomIndex = new FlinkHoodieBloomIndex(config); - List taggedRecords = bloomIndex.tagLocation(records, context, table); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + List taggedRecords = tagLocation(bloomIndex, records, table); // Check results for (HoodieRecord record : taggedRecords) { diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java index 171bab9fb2a31..054a363168d75 100644 --- a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.FlinkTaskContextSupplier; import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -29,7 +30,9 @@ import org.apache.hudi.common.testutils.HoodieCommonTestHarness; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.bloom.TestFlinkHoodieBloomIndex; +import org.apache.hudi.table.HoodieTable; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; @@ -128,6 +131,10 @@ protected void initMetaClient(HoodieTableType tableType) throws IOException { metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType); } + protected List tagLocation( + HoodieIndex index, List records, HoodieTable table) { + return 
HoodieList.getList(index.tagLocation(HoodieList.of(records), context, table)); + } /** * Cleanups file system. diff --git a/hudi-client/hudi-java-client/pom.xml b/hudi-client/hudi-java-client/pom.xml index 8dfb476f0c3fa..af97f68f341e7 100644 --- a/hudi-client/hudi-java-client/pom.xml +++ b/hudi-client/hudi-java-client/pom.xml @@ -121,6 +121,28 @@ junit-platform-commons test + + + org.apache.hadoop + hadoop-hdfs + tests + test + + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java index 57b6306cec0fa..212187b2d7552 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java @@ -20,6 +20,7 @@ import org.apache.hudi.client.common.HoodieJavaEngineContext; import org.apache.hudi.client.embedded.EmbeddedTimelineService; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; @@ -34,7 +35,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.JavaHoodieIndex; +import org.apache.hudi.index.JavaHoodieIndexFactory; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieJavaTable; import org.apache.hudi.table.HoodieTable; @@ -67,14 +68,15 @@ public List> filterExists(List> hoodieRecords) { // Create a Hoodie table which encapsulated the commits and files visible HoodieJavaTable table = HoodieJavaTable.create(config, (HoodieJavaEngineContext) context); Timer.Context indexTimer = metrics.getIndexCtx(); - List> recordsWithLocation = getIndex().tagLocation(hoodieRecords, context, table); + List> recordsWithLocation = HoodieList.getList( + getIndex().tagLocation(HoodieList.of(hoodieRecords), context, table)); metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 
0L : indexTimer.stop())); return recordsWithLocation.stream().filter(v1 -> !v1.isCurrentLocationKnown()).collect(Collectors.toList()); } @Override - protected HoodieIndex>, List, List> createIndex(HoodieWriteConfig writeConfig) { - return JavaHoodieIndex.createIndex(config); + protected HoodieIndex createIndex(HoodieWriteConfig writeConfig) { + return JavaHoodieIndexFactory.createIndex(config); } @Override @@ -89,7 +91,8 @@ public boolean commit(String instantTime, @Override protected HoodieTable>, List, List> createTable(HoodieWriteConfig config, - Configuration hadoopConf) { + Configuration hadoopConf, + boolean refreshTimeline) { return HoodieJavaTable.create(config, context); } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/HoodieJavaEngineContext.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/HoodieJavaEngineContext.java index 013e094036b6a..22d4ccabcdd67 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/HoodieJavaEngineContext.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/HoodieJavaEngineContext.java @@ -19,27 +19,40 @@ package org.apache.hudi.client.common; import org.apache.hadoop.conf.Configuration; + import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieAtomicLongAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.EngineProperty; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.function.SerializableBiFunction; import org.apache.hudi.common.function.SerializableConsumer; import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; import org.apache.hudi.common.function.SerializablePairFunction; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; +import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.stream.Collectors; import java.util.stream.Stream; import static java.util.stream.Collectors.toList; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapToPairWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingReduceWrapper; /** * A java engine implementation of HoodieEngineContext. 
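Every tagLocation/updateLocation call site touched in this patch follows the same bridge pattern: wrap a plain java.util.List into the engine-agnostic HoodieData view with HoodieList.of, run the index, and unwrap the result with HoodieList.getList. A minimal sketch of that round trip, assuming a HoodieWriteConfig, HoodieEngineContext, HoodieTable and record list are constructed elsewhere; only HoodieList, JavaHoodieIndexFactory and HoodieIndex.tagLocation are taken from this change, and raw types are kept for brevity:

static List<HoodieRecord> tagWithIndex(HoodieWriteConfig config,
                                       HoodieEngineContext context,
                                       HoodieTable table,
                                       List<HoodieRecord> records) {
  HoodieIndex index = JavaHoodieIndexFactory.createIndex(config);
  // List -> HoodieData, engine-agnostic index lookup, then HoodieData -> List
  return HoodieList.getList(index.tagLocation(HoodieList.of(records), context, table));
}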
@@ -54,11 +67,54 @@ public HoodieJavaEngineContext(Configuration conf, TaskContextSupplier taskConte super(new SerializableConfiguration(conf), taskContextSupplier); } + @Override + public HoodieAccumulator newAccumulator() { + return HoodieAtomicLongAccumulator.create(); + } + + @Override + public HoodieData emptyHoodieData() { + return HoodieList.of(Collections.emptyList()); + } + + @Override + public HoodieData parallelize(List data, int parallelism) { + return HoodieList.of(data); + } + @Override public List map(List data, SerializableFunction func, int parallelism) { return data.stream().parallel().map(throwingMapWrapper(func)).collect(toList()); } + @Override + public List mapToPairAndReduceByKey(List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel().map(throwingMapToPairWrapper(mapToPairFunc)) + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).get()) + .collect(Collectors.toList()); + } + + @Override + public Stream> mapPartitionsToPairAndReduceByKey(Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism) { + return throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) + .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() + .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( + Pair::getValue).reduce(throwingReduceWrapper(reduceFunc)).orElse(null))) + .filter(Objects::nonNull); + } + + @Override + public List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel() + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(toList()); diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieBloomIndex.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieBloomIndex.java deleted file mode 100644 index 47d47c8478932..0000000000000 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieBloomIndex.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.index; - -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.index.bloom.HoodieBaseBloomIndex; - -/** - * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata. - */ -public class JavaHoodieBloomIndex extends HoodieBaseBloomIndex { - public JavaHoodieBloomIndex(HoodieWriteConfig config) { - super(config); - } -} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java index fc7a451dcf34e..7f8b83f5c7d5d 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java @@ -7,13 +7,14 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ package org.apache.hudi.index; @@ -21,12 +22,12 @@ import org.apache.hudi.ApiMaturityLevel; import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.table.HoodieTable; @@ -38,36 +39,33 @@ protected JavaHoodieIndex(HoodieWriteConfig config) { super(config); } - public static HoodieIndex createIndex(HoodieWriteConfig config) { - // first use index class config to create index. 
- if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { - Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); - if (!(instance instanceof HoodieIndex)) { - throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); - } - return (JavaHoodieIndex) instance; - } - - // TODO more indexes to be added - switch (config.getIndexType()) { - case INMEMORY: - return new JavaInMemoryHashIndex(config); - case BLOOM: - return new JavaHoodieBloomIndex(config); - default: - throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); - } - } - @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List updateLocation(List writeStatuses, HoodieEngineContext context, HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List> tagLocation(List> records, HoodieEngineContext context, HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieList.of(tagLocation(HoodieList.getList(records), context, hoodieTable)); + } + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieList.of(updateLocation(HoodieList.getList(writeStatuses), context, hoodieTable)); + } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java new file mode 100644 index 0000000000000..f6135fb132afa --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index; + +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.ListBasedHoodieBloomIndexHelper; +import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex; + +/** + * A factory to generate Java {@link HoodieIndex}. 
+ */ +public final class JavaHoodieIndexFactory { + + public static HoodieIndex createIndex(HoodieWriteConfig config) { + // first use index class config to create index. + if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { + Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); + if (!(instance instanceof HoodieIndex)) { + throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); + } + return (HoodieIndex) instance; + } + + // TODO more indexes to be added + switch (config.getIndexType()) { + case INMEMORY: + return new HoodieInMemoryHashIndex<>(config); + case BLOOM: + return new HoodieBloomIndex<>(config, ListBasedHoodieBloomIndexHelper.getInstance()); + default: + throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); + } + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java index 06e66a1a02f91..a9e582110c206 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java @@ -24,6 +24,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieJavaEngineContext; @@ -31,15 +32,17 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; -import org.apache.hudi.table.action.clean.JavaCleanActionExecutor; -import org.apache.hudi.table.action.clean.JavaScheduleCleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; import org.apache.hudi.table.action.commit.JavaDeleteCommitActionExecutor; import org.apache.hudi.table.action.commit.JavaBulkInsertCommitActionExecutor; import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor; @@ -49,8 +52,9 @@ import org.apache.hudi.table.action.commit.JavaInsertPreppedCommitActionExecutor; import org.apache.hudi.table.action.commit.JavaUpsertCommitActionExecutor; import org.apache.hudi.table.action.commit.JavaUpsertPreppedCommitActionExecutor; -import org.apache.hudi.table.action.restore.JavaCopyOnWriteRestoreActionExecutor; -import org.apache.hudi.table.action.rollback.JavaCopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import 
org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; import org.apache.hudi.table.action.savepoint.SavepointActionExecutor; import java.util.List; @@ -63,6 +67,11 @@ protected HoodieJavaCopyOnWriteTable(HoodieWriteConfig config, super(config, context, metaClient); } + @Override + public boolean isTableServiceAction(String actionType) { + return !actionType.equals(HoodieTimeline.COMMIT_ACTION); + } + @Override public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, @@ -142,6 +151,11 @@ public HoodieWriteMetadata> insertOverwriteTable(HoodieEngineC context, config, this, instantTime, records).execute(); } + @Override + public void updateStatistics(HoodieEngineContext context, List stats, String instantTime, Boolean isOptimizeOperation) { + throw new HoodieNotSupportedException("update statistics is not supported yet"); + } + @Override public Option scheduleCompaction(HoodieEngineContext context, String instantTime, @@ -177,24 +191,31 @@ public void rollbackBootstrap(HoodieEngineContext context, throw new HoodieNotSupportedException("RollbackBootstrap is not supported yet"); } + @Override + public Option scheduleRollback(HoodieEngineContext context, String instantTime, HoodieInstant instantToRollback, + boolean skipTimelinePublish) { + return new BaseRollbackPlanActionExecutor(context, config, this, instantTime, instantToRollback, skipTimelinePublish).execute(); + } + @Override public Option scheduleCleaning(HoodieEngineContext context, String instantTime, Option> extraMetadata) { - return new JavaScheduleCleanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); + return new CleanPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); } @Override public HoodieCleanMetadata clean(HoodieEngineContext context, - String cleanInstantTime) { - return new JavaCleanActionExecutor(context, config, this, cleanInstantTime).execute(); + String cleanInstantTime, boolean skipLocking) { + return new CleanActionExecutor(context, config, this, cleanInstantTime).execute(); } @Override public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, - boolean deleteInstants) { - return new JavaCopyOnWriteRollbackActionExecutor( - context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute(); + boolean deleteInstants, + boolean skipLocking) { + return new CopyOnWriteRollbackActionExecutor( + context, config, this, rollbackInstantTime, commitInstant, deleteInstants, skipLocking).execute(); } @Override @@ -210,7 +231,7 @@ public HoodieSavepointMetadata savepoint(HoodieEngineContext context, public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { - return new JavaCopyOnWriteRestoreActionExecutor((HoodieJavaEngineContext) context, - config, this, restoreInstantTime, instantToRestore).execute(); + return new CopyOnWriteRestoreActionExecutor( + context, config, this, restoreInstantTime, instantToRestore).execute(); } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaMergeOnReadTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaMergeOnReadTable.java index 4995af0d6acf2..b219ba1a99016 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaMergeOnReadTable.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaMergeOnReadTable.java @@ 
-18,14 +18,46 @@ package org.apache.hudi.table; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor; +import org.apache.hudi.table.action.deltacommit.JavaUpsertPreppedDeltaCommitActionExecutor; + +import java.util.List; public class HoodieJavaMergeOnReadTable extends HoodieJavaCopyOnWriteTable { protected HoodieJavaMergeOnReadTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { super(config, context, metaClient); } - // TODO not support yet. + + @Override + public boolean isTableServiceAction(String actionType) { + return !actionType.equals(HoodieTimeline.DELTA_COMMIT_ACTION); + } + + @Override + public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, + String instantTime, + List> preppedRecords) { + return new JavaUpsertPreppedDeltaCommitActionExecutor<>((HoodieJavaEngineContext) context, config, + this, instantTime, preppedRecords).execute(); + + } + + @Override + public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, + String instantTime, + List> preppedRecords, + Option>>> bulkInsertPartitioner) { + return new JavaBulkInsertPreppedCommitActionExecutor((HoodieJavaEngineContext) context, config, + this, instantTime, preppedRecords, bulkInsertPartitioner).execute(); + } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaTable.java index 219dec4e2b19b..8b0a7a95ef87c 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaTable.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaTable.java @@ -29,9 +29,8 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.index.JavaHoodieIndex; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.JavaHoodieIndexFactory; import java.util.List; @@ -56,14 +55,14 @@ public static HoodieJavaTable create(HoodieWr case COPY_ON_WRITE: return new HoodieJavaCopyOnWriteTable<>(config, context, metaClient); case MERGE_ON_READ: - throw new HoodieNotSupportedException("MERGE_ON_READ is not supported yet"); + return new HoodieJavaMergeOnReadTable<>(config, context, metaClient); default: throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); } } @Override - protected HoodieIndex>, List, List> getIndex(HoodieWriteConfig config, HoodieEngineContext context) { - return JavaHoodieIndex.createIndex(config); + protected HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context) { + return JavaHoodieIndexFactory.createIndex(config); } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/clean/JavaCleanActionExecutor.java 
b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/clean/JavaCleanActionExecutor.java deleted file mode 100644 index 0ca73d40e84dd..0000000000000 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/clean/JavaCleanActionExecutor.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.clean; - -import org.apache.hudi.avro.model.HoodieActionInstant; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.HoodieCleanStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.CleanFileInfo; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.ImmutablePair; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class JavaCleanActionExecutor extends - BaseCleanActionExecutor>, List, List> { - - private static final Logger LOG = LogManager.getLogger(JavaCleanActionExecutor.class); - - public JavaCleanActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime) { - super(context, config, table, instantTime); - } - - @Override - List clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) { - - Iterator> filesToBeDeletedPerPartition = cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream() - .flatMap(x -> x.getValue().stream().map(y -> new ImmutablePair<>(x.getKey(), new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile())))).iterator(); - - Stream> partitionCleanStats = - deleteFilesFunc(filesToBeDeletedPerPartition, table) - .collect(Collectors.groupingBy(Pair::getLeft)) - .entrySet().stream() - .map(x -> new ImmutablePair(x.getKey(), x.getValue().stream().map(y -> y.getRight()).reduce(PartitionCleanStat::merge).get())); - - Map partitionCleanStatsMap = partitionCleanStats - .collect(Collectors.toMap(Pair::getLeft, Pair::getRight)); - - // Return PartitionCleanStat for each partition passed. 
- return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> { - PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath) - ? partitionCleanStatsMap.get(partitionPath) - : new PartitionCleanStat(partitionPath); - HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain(); - return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath) - .withEarliestCommitRetained(Option.ofNullable( - actionInstant != null - ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), - actionInstant.getAction(), actionInstant.getTimestamp()) - : null)) - .withDeletePathPattern(partitionCleanStat.deletePathPatterns()) - .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles()) - .withFailedDeletes(partitionCleanStat.failedDeleteFiles()) - .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns()) - .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles()) - .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles()) - .build(); - }).collect(Collectors.toList()); - } - - private static Stream> deleteFilesFunc(Iterator> iter, HoodieTable table) { - Map partitionCleanStatMap = new HashMap<>(); - FileSystem fs = table.getMetaClient().getFs(); - - while (iter.hasNext()) { - Pair partitionDelFileTuple = iter.next(); - String partitionPath = partitionDelFileTuple.getLeft(); - Path deletePath = new Path(partitionDelFileTuple.getRight().getFilePath()); - String deletePathStr = deletePath.toString(); - Boolean deletedFileResult = null; - try { - deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); - } catch (IOException e) { - LOG.error("Delete file failed"); - } - if (!partitionCleanStatMap.containsKey(partitionPath)) { - partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath)); - } - boolean isBootstrapBasePathFile = partitionDelFileTuple.getRight().isBootstrapBaseFile(); - PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath); - if (isBootstrapBasePathFile) { - // For Bootstrap Base file deletions, store the full file path. - partitionCleanStat.addDeleteFilePatterns(deletePath.toString(), true); - partitionCleanStat.addDeletedFileResult(deletePath.toString(), deletedFileResult, true); - } else { - partitionCleanStat.addDeleteFilePatterns(deletePath.getName(), false); - partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult, false); - } - } - return partitionCleanStatMap.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())); - } -} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/clean/JavaScheduleCleanActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/clean/JavaScheduleCleanActionExecutor.java deleted file mode 100644 index 05d19a63ef160..0000000000000 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/clean/JavaScheduleCleanActionExecutor.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.clean; - -import java.util.List; -import java.util.Map; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -public class JavaScheduleCleanActionExecutor extends - BaseCleanPlanActionExecutor>, List, List> { - - private static final Logger LOG = LogManager.getLogger(JavaScheduleCleanActionExecutor.class); - - public JavaScheduleCleanActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - Option> extraMetadata) { - super(context, config, table, instantTime, extraMetadata); - } - - @Override - protected Option createCleanerPlan() { - return super.execute(); - } -} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java index aec84a50e18af..66cb40758bdc0 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -128,7 +129,8 @@ public HoodieWriteMetadata> execute(List> inpu protected void updateIndex(List writeStatuses, HoodieWriteMetadata> result) { Instant indexStartTime = Instant.now(); // Update the index back - List statuses = table.getIndex().updateLocation(writeStatuses, context, table); + List statuses = HoodieList.getList( + table.getIndex().updateLocation(HoodieList.of(writeStatuses), context, table)); result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); result.setWriteStatuses(statuses); } @@ -206,6 +208,8 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); + writeTableMetadata(metadata, actionType); + activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); LOG.info("Committed " + instantTime); @@ -327,7 +331,8 @@ public Partitioner getInsertPartitioner(WorkloadProfile profile) 
{ public void updateIndexAndCommitIfNeeded(List writeStatuses, HoodieWriteMetadata result) { Instant indexStartTime = Instant.now(); // Update the index back - List statuses = table.getIndex().updateLocation(writeStatuses, context, table); + List statuses = HoodieList.getList( + table.getIndex().updateLocation(HoodieList.of(writeStatuses), context, table)); result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); result.setWriteStatuses(statuses); result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(result)); diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java index a907c9fd9760e..fc81b787f4737 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; import org.apache.hudi.common.model.HoodieKey; @@ -97,8 +98,8 @@ public HoodieWriteMetadata> execute(String instantTime, dedupedKeys.stream().map(key -> new HoodieRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); Instant beginTag = Instant.now(); // perform index look up to get existing location of records - List> taggedRecords = - table.getIndex().tagLocation(dedupedRecords, context, table); + List> taggedRecords = HoodieList.getList( + table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table)); Duration tagLocationDuration = Duration.between(beginTag, Instant.now()); // filter out non existent keys/records diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java index 1170f2f4eac4c..a52ab6e0f3d0c 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java @@ -52,8 +52,7 @@ protected List getAllExistingFileIds(String partitionPath) { protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeResult) { Map> partitionToExistingFileIds = new HashMap<>(); List partitionPaths = FSUtils.getAllPartitionPaths(context, - table.getMetaClient().getBasePath(), config.isMetadataTableEnabled(), - config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning()); + table.getMetaClient().getBasePath(), config.isMetadataTableEnabled(), config.shouldAssumeDatePartitioning()); if (partitionPaths != null && partitionPaths.size() > 0) { partitionToExistingFileIds = context.mapToPair(partitionPaths, diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java index ec7ea1641a442..8af7707ea2f98 100644 --- 
a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java @@ -19,11 +19,14 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieList; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieTable; import java.util.List; import java.util.Map; @@ -45,9 +48,14 @@ public static JavaWriteHelper newInstance() { } @Override - public List> deduplicateRecords(List> records, - HoodieIndex>, List, List> index, - int parallelism) { + protected List> tag(List> dedupedRecords, HoodieEngineContext context, HoodieTable>, List, List> table) { + return HoodieList.getList( + table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table)); + } + + @Override + public List> deduplicateRecords( + List> records, HoodieIndex index, int parallelism) { boolean isIndexingGlobal = index.isGlobal(); Map>>> keyedRecords = records.stream().map(record -> { HoodieKey hoodieKey = record.getKey(); diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseJavaDeltaCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseJavaDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..0b4a654074408 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseJavaDeltaCommitActionExecutor.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.action.deltacommit; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.BaseJavaCommitActionExecutor; + +public abstract class BaseJavaDeltaCommitActionExecutor> extends BaseJavaCommitActionExecutor { + + public BaseJavaDeltaCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, WriteOperationType operationType) { + super(context, config, table, instantTime, operationType); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/JavaUpsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/JavaUpsertPreppedDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..f6faa28bbb1ef --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/JavaUpsertPreppedDeltaCommitActionExecutor.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.action.deltacommit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.io.HoodieAppendHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.JavaBulkInsertHelper; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; + +public class JavaUpsertPreppedDeltaCommitActionExecutor> extends BaseJavaDeltaCommitActionExecutor { + + private static final Logger LOG = LogManager.getLogger(JavaUpsertPreppedDeltaCommitActionExecutor.class); + + private final List> preppedInputRecords; + + public JavaUpsertPreppedDeltaCommitActionExecutor(HoodieJavaEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, List> preppedInputRecords) { + super(context, config, table, instantTime, WriteOperationType.UPSERT_PREPPED); + this.preppedInputRecords = preppedInputRecords; + } + + @Override + public HoodieWriteMetadata> execute() { + HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); + // First group by target file id. + HashMap, List>> recordsByFileId = new HashMap<>(); + List> insertedRecords = new LinkedList<>(); + + // Split records into inserts and updates. 
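+ // Records whose current location is known are grouped by (fileId, partitionPath) and appended to that existing file slice via HoodieAppendHandle; records without a known location fall through to the bulk-insert path further below.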
+ for (HoodieRecord record : preppedInputRecords) { + if (!record.isCurrentLocationKnown()) { + insertedRecords.add(record); + } else { + Pair fileIdPartitionPath = Pair.of(record.getCurrentLocation().getFileId(), record.getPartitionPath()); + if (!recordsByFileId.containsKey(fileIdPartitionPath)) { + recordsByFileId.put(fileIdPartitionPath, new LinkedList<>()); + } + recordsByFileId.get(fileIdPartitionPath).add(record); + } + } + LOG.info(String.format("Total update fileIDs %s, total inserts %s for commit %s", + recordsByFileId.size(), insertedRecords.size(), instantTime)); + + List allWriteStatuses = new ArrayList<>(); + try { + recordsByFileId.forEach((k, v) -> { + HoodieAppendHandle appendHandle = new HoodieAppendHandle(config, instantTime, table, + k.getRight(), k.getLeft(), v.iterator(), taskContextSupplier); + appendHandle.doAppend(); + allWriteStatuses.addAll(appendHandle.close()); + }); + + if (insertedRecords.size() > 0) { + HoodieWriteMetadata> insertResult = JavaBulkInsertHelper.newInstance() + .bulkInsert(insertedRecords, instantTime, table, config, this, false, Option.empty()); + allWriteStatuses.addAll(insertResult.getWriteStatuses()); + } + } catch (Throwable e) { + if (e instanceof HoodieUpsertException) { + throw e; + } + throw new HoodieUpsertException("Failed to upsert for commit time " + instantTime, e); + } + + updateIndex(allWriteStatuses, result); + return result; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaCopyOnWriteRollbackActionExecutor.java deleted file mode 100644 index 15e393220f083..0000000000000 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaCopyOnWriteRollbackActionExecutor.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class JavaCopyOnWriteRollbackActionExecutor extends - BaseCopyOnWriteRollbackActionExecutor>, List, List> { - public JavaCopyOnWriteRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public JavaCopyOnWriteRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() { - if (useMarkerBasedStrategy) { - return new JavaMarkerBasedRollbackStrategy(table, context, config, instantTime); - } else { - return this::executeRollbackUsingFileListing; - } - } - - @Override - protected List executeRollbackUsingFileListing(HoodieInstant instantToRollback) { - List rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW( - context, table.getMetaClient().getBasePath(), config); - return new JavaListingBasedRollbackHelper(table.getMetaClient(), config) - .performRollback(context, instantToRollback, rollbackRequests); - } -} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaListingBasedRollbackHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaListingBasedRollbackHelper.java deleted file mode 100644 index 5331ca5891c28..0000000000000 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaListingBasedRollbackHelper.java +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock; -import org.apache.hudi.common.table.log.block.HoodieLogBlock; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.collection.ImmutablePair; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieRollbackException; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.PathFilter; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; - -/** - * Performs Rollback of Hoodie Tables. - */ -public class JavaListingBasedRollbackHelper implements Serializable { - - private static final Logger LOG = LogManager.getLogger(JavaListingBasedRollbackHelper.class); - - private final HoodieTableMetaClient metaClient; - private final HoodieWriteConfig config; - - public JavaListingBasedRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { - this.metaClient = metaClient; - this.config = config; - } - - /** - * Performs all rollback actions that we have collected in parallel. - */ - public List performRollback(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - Map partitionPathRollbackStatsPairs = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, true); - - Map>> collect = partitionPathRollbackStatsPairs.entrySet() - .stream() - .map(x -> Pair.of(x.getKey(), x.getValue())).collect(Collectors.groupingBy(Pair::getLeft)); - return collect.values().stream() - .map(pairs -> pairs.stream().map(Pair::getRight).reduce(RollbackUtils::mergeRollbackStat).orElse(null)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } - - /** - * Collect all file info that needs to be rollbacked. - */ - public List collectRollbackStats(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - Map partitionPathRollbackStatsPairs = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, false); - return new ArrayList<>(partitionPathRollbackStatsPairs.values()); - } - - /** - * May be delete interested files and collect stats or collect stats only. - * - * @param context instance of {@link HoodieEngineContext} to use. - * @param instantToRollback {@link HoodieInstant} of interest for which deletion or collect stats is requested. - * @param rollbackRequests List of {@link ListingBasedRollbackRequest} to be operated on. - * @param doDelete {@code true} if deletion has to be done. {@code false} if only stats are to be collected w/o performing any deletes. - * @return stats collected with or w/o actual deletions. 
- */ - Map maybeDeleteAndCollectStats(HoodieEngineContext context, - HoodieInstant instantToRollback, - List rollbackRequests, - boolean doDelete) { - return context.mapToPair(rollbackRequests, rollbackRequest -> { - switch (rollbackRequest.getType()) { - case DELETE_DATA_FILES_ONLY: { - final Map filesToDeletedStatus = deleteBaseFiles(metaClient, config, instantToRollback.getTimestamp(), - rollbackRequest.getPartitionPath(), doDelete); - return new ImmutablePair<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case DELETE_DATA_AND_LOG_FILES: { - final Map filesToDeletedStatus = deleteBaseAndLogFiles(metaClient, config, instantToRollback.getTimestamp(), rollbackRequest.getPartitionPath(), doDelete); - return new ImmutablePair<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case APPEND_ROLLBACK_BLOCK: { - HoodieLogFormat.Writer writer = null; - try { - writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath())) - .withFileId(rollbackRequest.getFileId().get()) - .overBaseCommit(rollbackRequest.getLatestBaseInstant().get()).withFs(metaClient.getFs()) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - - // generate metadata - if (doDelete) { - Map header = generateHeader(instantToRollback.getTimestamp()); - // if update belongs to an existing log file - writer.appendBlock(new HoodieCommandBlock(header)); - } - } catch (IOException | InterruptedException io) { - throw new HoodieRollbackException("Failed to rollback for instant " + instantToRollback, io); - } finally { - try { - if (writer != null) { - writer.close(); - } - } catch (IOException io) { - throw new HoodieIOException("Error appending rollback block..", io); - } - } - - // This step is intentionally done after writer is closed. Guarantees that - // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in - // cloud-storage : HUDI-168 - Map filesToNumBlocksRollback = Collections.singletonMap( - metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()), 1L - ); - return new ImmutablePair<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withRollbackBlockAppendResults(filesToNumBlocksRollback).build()); - } - default: - throw new IllegalStateException("Unknown Rollback action " + rollbackRequest); - } - }, 0); - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. 
- */ - private Map deleteBaseAndLogFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - LOG.info("Cleaning path " + partitionPath); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - SerializablePathFilter filter = (path) -> { - if (path.toString().endsWith(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } else if (FSUtils.isLogFile(path)) { - // Since the baseCommitTime is the only commit for new log files, it's okay here - String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); - return commit.equals(fileCommitTime); - } - return false; - }; - - final Map results = new HashMap<>(); - FileSystem fs = metaClient.getFs(); - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. - */ - private Map deleteBaseFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - final Map results = new HashMap<>(); - LOG.info("Cleaning path " + partitionPath); - FileSystem fs = metaClient.getFs(); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - PathFilter filter = (path) -> { - if (path.toString().contains(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } - return false; - }; - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - private Map generateHeader(String commit) { - // generate metadata - Map header = new HashMap<>(3); - header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); - header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, commit); - header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, - String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); - return header; - } - - public interface SerializablePathFilter extends PathFilter, Serializable { - - } -} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaMarkerBasedRollbackStrategy.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaMarkerBasedRollbackStrategy.java deleted file mode 100644 index 150f663cf44f5..0000000000000 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/rollback/JavaMarkerBasedRollbackStrategy.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.marker.MarkerBasedRollbackUtils; -import org.apache.hudi.table.marker.WriteMarkers; - -import java.util.List; -import java.util.stream.Collectors; - -@SuppressWarnings("checkstyle:LineLength") -public class JavaMarkerBasedRollbackStrategy extends AbstractMarkerBasedRollbackStrategy>, List, List> { - public JavaMarkerBasedRollbackStrategy(HoodieTable>, List, List> table, - HoodieEngineContext context, - HoodieWriteConfig config, - String instantTime) { - super(table, context, config, instantTime); - } - - @Override - public List execute(HoodieInstant instantToRollback) { - try { - List markerPaths = MarkerBasedRollbackUtils.getAllMarkerPaths( - table, context, instantToRollback.getTimestamp(), config.getRollbackParallelism()); - List rollbackStats = context.map(markerPaths, markerFilePath -> { - String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1); - IOType type = IOType.valueOf(typeStr); - switch (type) { - case MERGE: - return undoMerge(WriteMarkers.stripMarkerSuffix(markerFilePath)); - case APPEND: - return undoAppend(WriteMarkers.stripMarkerSuffix(markerFilePath), instantToRollback); - case CREATE: - return undoCreate(WriteMarkers.stripMarkerSuffix(markerFilePath)); - default: - throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback); - } - }, 0); - - return rollbackStats.stream().map(rollbackStat -> Pair.of(rollbackStat.getPartitionPath(), rollbackStat)) - .collect(Collectors.groupingBy(Pair::getKey)) - .values() - .stream() - .map(x -> x.stream().map(y -> y.getValue()).reduce(RollbackUtils::mergeRollbackStat).get()) - .collect(Collectors.toList()); - } catch (Exception e) { - throw new HoodieRollbackException("Error rolling back using marker files written for " + instantToRollback, e); - } - } -} diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java index 796d7b74a83c5..4a3f3d5bcef89 100644 --- 
a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.HoodieJavaWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieKey; @@ -114,7 +115,8 @@ private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() { return HoodieWriteConfig.newBuilder() .withEngineType(EngineType.JAVA) .withPath(basePath) - .withSchema(SCHEMA.toString()); + .withSchema(SCHEMA.toString()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()); } @Test diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java index fc615e0f821cc..84040f906ce32 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java @@ -32,9 +32,10 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; @@ -57,7 +58,7 @@ /** * Provides an RDD based API for accessing/filtering Hoodie tables, based on keys. */ -public class HoodieReadClient implements Serializable { +public class HoodieReadClient> implements Serializable { private static final long serialVersionUID = 1L; @@ -65,7 +66,7 @@ public class HoodieReadClient implements Serializ * TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple * basepath pointing to the table. 
Until, then just always assume a BloomIndex */ - private final transient HoodieIndex>, JavaRDD, JavaRDD> index; + private final transient HoodieIndex index; private HoodieTable>, JavaRDD, JavaRDD> hoodieTable; private transient Option sqlContextOpt; private final transient HoodieSparkEngineContext context; @@ -100,7 +101,7 @@ public HoodieReadClient(HoodieSparkEngineContext context, HoodieWriteConfig clie // Create a Hoodie table which encapsulated the commits and files visible HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); this.hoodieTable = HoodieSparkTable.create(clientConfig, context, metaClient); - this.index = SparkHoodieIndex.createIndex(clientConfig); + this.index = SparkHoodieIndexFactory.createIndex(clientConfig); this.sqlContextOpt = Option.empty(); } @@ -170,7 +171,9 @@ public Dataset readROView(JavaRDD hoodieKeys, int parallelism) { * component (without scheme) of the URI underlying file */ public JavaPairRDD>> checkExists(JavaRDD hoodieKeys) { - return index.tagLocation(hoodieKeys.map(k -> new HoodieRecord<>(k, null)), context, hoodieTable) + return HoodieJavaRDD.getJavaRDD( + index.tagLocation(HoodieJavaRDD.of(hoodieKeys.map(k -> new HoodieRecord<>(k, null))), + context, hoodieTable)) .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) : Option.empty()) @@ -196,7 +199,8 @@ public JavaRDD> filterExists(JavaRDD> hoodieReco * @return Tagged RDD of Hoodie records */ public JavaRDD> tagLocation(JavaRDD> hoodieRecords) throws HoodieIndexException { - return index.tagLocation(hoodieRecords, context, hoodieTable); + return HoodieJavaRDD.getJavaRDD( + index.tagLocation(HoodieJavaRDD.of(hoodieRecords), context, hoodieTable)); } /** diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java index 884b555447897..16e54a21551c6 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java @@ -19,16 +19,20 @@ package org.apache.hudi.client; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import java.io.IOException; +import java.util.stream.Stream; /** * Async clustering client for Spark datasource. 
@@ -47,8 +51,11 @@ public HoodieSparkClusteringClient( public void cluster(HoodieInstant instant) throws IOException { LOG.info("Executing clustering instance " + instant); SparkRDDWriteClient writeClient = (SparkRDDWriteClient) clusteringClient; - JavaRDD res = writeClient.cluster(instant.getTimestamp(), true).getWriteStatuses(); - if (res != null && res.collect().stream().anyMatch(WriteStatus::hasErrors)) { + Option commitMetadata = writeClient.cluster(instant.getTimestamp(), true).getCommitMetadata(); + Stream hoodieWriteStatStream = commitMetadata.get().getPartitionToWriteStats().entrySet().stream().flatMap(e -> + e.getValue().stream()); + long errorsCount = hoodieWriteStatStream.mapToLong(HoodieWriteStat::getTotalWriteErrors).sum(); + if (errorsCount > 0) { // TODO: Should we treat this fatal and throw exception? LOG.error("Clustering for instant (" + instant + ") failed with write errors"); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java index 79601f8fa1dd9..4154dc152c74d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java @@ -40,11 +40,11 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieClusteringException; import org.apache.hudi.exception.HoodieCommitException; -import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.metrics.DistributedRegistry; @@ -52,10 +52,10 @@ import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.compact.SparkCompactHelpers; +import org.apache.hudi.table.action.compact.CompactHelpers; import org.apache.hudi.table.marker.WriteMarkersFactory; -import org.apache.hudi.table.upgrade.AbstractUpgradeDowngrade; -import org.apache.hudi.table.upgrade.SparkUpgradeDowngrade; +import org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper; +import org.apache.hudi.table.upgrade.UpgradeDowngrade; import com.codahale.metrics.Timer; import org.apache.hadoop.conf.Configuration; @@ -79,7 +79,7 @@ public class SparkRDDWriteClient extends private static final Logger LOG = LogManager.getLogger(SparkRDDWriteClient.class); public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { - super(context, clientConfig); + this(context, clientConfig, Option.empty()); } @Deprecated @@ -96,6 +96,11 @@ public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeC public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, Option timelineService) { super(context, writeConfig, timelineService); + if (config.isMetadataTableEnabled()) { + // If the metadata table does not exist, it should be bootstrapped here + // TODO: Check if we can remove this requirement - auto bootstrap on commit + 
SparkHoodieBackedTableMetadataWriter.create(context.getHadoopConf().get(), config, context); + } } /** @@ -110,8 +115,8 @@ public static SparkConf registerClasses(SparkConf conf) { } @Override - protected HoodieIndex>, JavaRDD, JavaRDD> createIndex(HoodieWriteConfig writeConfig) { - return SparkHoodieIndex.createIndex(config); + protected HoodieIndex createIndex(HoodieWriteConfig writeConfig) { + return SparkHoodieIndexFactory.createIndex(config); } /** @@ -126,8 +131,9 @@ public boolean commit(String instantTime, JavaRDD writeStatuses, Op @Override protected HoodieTable>, JavaRDD, JavaRDD> createTable(HoodieWriteConfig config, - Configuration hadoopConf) { - return HoodieSparkTable.create(config, context); + Configuration hadoopConf, + boolean refreshTimeline) { + return HoodieSparkTable.create(config, context, refreshTimeline); } @Override @@ -135,7 +141,8 @@ public JavaRDD> filterExists(JavaRDD> hoodieReco // Create a Hoodie table which encapsulated the commits and files visible HoodieSparkTable table = HoodieSparkTable.create(config, context); Timer.Context indexTimer = metrics.getIndexCtx(); - JavaRDD> recordsWithLocation = getIndex().tagLocation(hoodieRecords, context, table); + JavaRDD> recordsWithLocation = HoodieJavaRDD.getJavaRDD( + getIndex().tagLocation(HoodieJavaRDD.of(hoodieRecords), context, table)); metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop())); return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); } @@ -286,8 +293,8 @@ protected JavaRDD postWrite(HoodieWriteMetadata writeStatuses, Option> extraMetadata) throws IOException { HoodieSparkTable table = HoodieSparkTable.create(config, context); - HoodieCommitMetadata metadata = SparkCompactHelpers.newInstance().createCompactionMetadata( - table, compactionInstantTime, writeStatuses, config.getSchema()); + HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata( + table, compactionInstantTime, HoodieJavaRDD.of(writeStatuses), config.getSchema()); extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata)); completeCompaction(metadata, writeStatuses, table, compactionInstantTime); } @@ -298,14 +305,17 @@ protected void completeCompaction(HoodieCommitMetadata metadata, JavaRDD writeStats = writeStatuses.map(WriteStatus::getStat).collect(); + writeTableMetadata(table, metadata, new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionCommitTime)); + // commit to data table after committing to metadata table. finalizeWrite(table, compactionCommitTime, writeStats); LOG.info("Committing Compaction " + compactionCommitTime + ". Finished with result " + metadata); - SparkCompactHelpers.newInstance().completeInflightCompaction(table, compactionCommitTime, metadata); - + CompactHelpers.getInstance().completeInflightCompaction(table, compactionCommitTime, metadata); + WriteMarkersFactory.get(config.getMarkersType(), table, compactionCommitTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); if (compactionTimer != null) { long durationInMs = metrics.getDurationInMs(compactionTimer.stop()); try { - metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).getTime(), + metrics.updateCommitMetrics(HoodieActiveTimeline.parseInstantTime(compactionCommitTime).getTime(), durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION); } catch (ParseException e) { throw new HoodieCommitException("Commit time is not of valid format. 
Failed to commit compaction " @@ -317,16 +327,17 @@ protected void completeCompaction(HoodieCommitMetadata metadata, JavaRDD compact(String compactionInstantTime, boolean shouldComplete) { - HoodieSparkTable table = HoodieSparkTable.create(config, context); + HoodieSparkTable table = HoodieSparkTable.create(config, context, true); preWrite(compactionInstantTime, WriteOperationType.COMPACT, table.getMetaClient()); HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline(); HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime); if (pendingCompactionTimeline.containsInstant(inflightInstant)) { - rollbackInflightCompaction(inflightInstant, table); + table.rollbackInflightCompaction(inflightInstant); table.getMetaClient().reloadActiveTimeline(); } compactionTimer = metrics.getCompactionCtx(); - HoodieWriteMetadata> compactionMetadata = table.compact(context, compactionInstantTime); + HoodieWriteMetadata> compactionMetadata = + table.compact(context, compactionInstantTime); JavaRDD statuses = compactionMetadata.getWriteStatuses(); if (shouldComplete && compactionMetadata.getCommitMetadata().isPresent()) { completeTableService(TableServiceType.COMPACT, compactionMetadata.getCommitMetadata().get(), statuses, table, compactionInstantTime); @@ -336,7 +347,7 @@ protected JavaRDD compact(String compactionInstantTime, boolean sho @Override public HoodieWriteMetadata> cluster(String clusteringInstant, boolean shouldComplete) { - HoodieSparkTable table = HoodieSparkTable.create(config, context); + HoodieSparkTable table = HoodieSparkTable.create(config, context, config.isMetadataTableEnabled()); preWrite(clusteringInstant, WriteOperationType.CLUSTER, table.getMetaClient()); HoodieTimeline pendingClusteringTimeline = table.getActiveTimeline().filterPendingReplaceTimeline(); HoodieInstant inflightInstant = HoodieTimeline.getReplaceCommitInflightInstant(clusteringInstant); @@ -366,21 +377,26 @@ private void completeClustering(HoodieReplaceCommitMetadata metadata, JavaRDD s.getTotalWriteErrors() > 0L).map(s -> s.getFileId()).collect(Collectors.joining(","))); } + writeTableMetadata(table, metadata, new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.REPLACE_COMMIT_ACTION, clusteringCommitTime)); finalizeWrite(table, clusteringCommitTime, writeStats); try { + // try to save statistics info to hudi + if (config.isDataSkippingEnabled() && config.isLayoutOptimizationEnabled() && !config.getClusteringSortColumns().isEmpty()) { + table.updateStatistics(context, writeStats, clusteringCommitTime, true); + } LOG.info("Committing Clustering " + clusteringCommitTime + ". 
Finished with result " + metadata); table.getActiveTimeline().transitionReplaceInflightToComplete( HoodieTimeline.getReplaceCommitInflightInstant(clusteringCommitTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); } catch (IOException e) { - throw new HoodieClusteringException("unable to transition clustering inflight to complete: " + clusteringCommitTime, e); + throw new HoodieClusteringException("unable to transition clustering inflight to complete: " + clusteringCommitTime, e); } WriteMarkersFactory.get(config.getMarkersType(), table, clusteringCommitTime) .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); if (clusteringTimer != null) { long durationInMs = metrics.getDurationInMs(clusteringTimer.stop()); try { - metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(clusteringCommitTime).getTime(), + metrics.updateCommitMetrics(HoodieActiveTimeline.parseInstantTime(clusteringCommitTime).getTime(), durationInMs, metadata, HoodieActiveTimeline.REPLACE_COMMIT_ACTION); } catch (ParseException e) { throw new HoodieCommitException("Commit time is not of valid format. Failed to commit compaction " @@ -390,24 +406,40 @@ private void completeClustering(HoodieReplaceCommitMetadata metadata, JavaRDD>, JavaRDD, JavaRDD> table, HoodieCommitMetadata commitMetadata, + HoodieInstant hoodieInstant) { + try { + this.txnManager.beginTransaction(Option.of(hoodieInstant), Option.empty()); + boolean isTableServiceAction = table.isTableServiceAction(hoodieInstant.getAction()); + // Do not do any conflict resolution here as we do with regular writes. We take the lock here to ensure all writes to metadata table happens within a + // single lock (single writer). Because more than one write to metadata table will result in conflicts since all of them updates the same partition. 
+ table.getMetadataWriter().ifPresent(w -> w.update(commitMetadata, hoodieInstant.getTimestamp(), isTableServiceAction)); + } finally { + this.txnManager.endTransaction(); + } + } + @Override protected HoodieTable>, JavaRDD, JavaRDD> getTableAndInitCtx(WriteOperationType operationType, String instantTime) { HoodieTableMetaClient metaClient = createMetaClient(true); - AbstractUpgradeDowngrade upgradeDowngrade = new SparkUpgradeDowngrade(metaClient, config, context); + UpgradeDowngrade upgradeDowngrade = new UpgradeDowngrade( + metaClient, config, context, SparkUpgradeDowngradeHelper.getInstance()); if (upgradeDowngrade.needsUpgradeOrDowngrade(HoodieTableVersion.current())) { if (config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) { this.txnManager.beginTransaction(); try { // Ensure no inflight commits by setting EAGER policy and explicitly cleaning all failed commits - this.rollbackFailedWrites(getInstantsToRollback(metaClient, HoodieFailedWritesCleaningPolicy.EAGER)); - new SparkUpgradeDowngrade(metaClient, config, context) - .run(metaClient, HoodieTableVersion.current(), config, context, instantTime); + this.rollbackFailedWrites(getInstantsToRollback(metaClient, HoodieFailedWritesCleaningPolicy.EAGER, Option.of(instantTime)), true); + new UpgradeDowngrade( + metaClient, config, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.current(), instantTime); } finally { this.txnManager.endTransaction(); } } else { - upgradeDowngrade.run(metaClient, HoodieTableVersion.current(), config, context, instantTime); + upgradeDowngrade.run(HoodieTableVersion.current(), instantTime); } + metaClient.reloadActiveTimeline(); } metaClient.validateTableProperties(config.getProps(), operationType); return getTableAndInitCtx(metaClient, operationType, instantTime); @@ -436,37 +468,25 @@ private HoodieTable>, JavaRDD, JavaRDD table = HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient); + HoodieSparkTable table = HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient, config.isMetadataTableEnabled()); if (table.getMetaClient().getCommitActionType().equals(HoodieTimeline.COMMIT_ACTION)) { writeTimer = metrics.getCommitCtx(); } else { writeTimer = metrics.getDeltaCommitCtx(); } + table.getHoodieView().sync(); return table; } @Override - public void syncTableMetadata() { - if (!config.getMetadataConfig().enableSync()) { - LOG.info("Metadata table sync is disabled in the config."); - return; - } - - // Open up the metadata table again, for syncing - try (HoodieTableMetadataWriter writer = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context)) { - LOG.info("Successfully synced to metadata table"); - } catch (Exception e) { - throw new HoodieMetadataException("Error syncing to metadata table.", e); - } - } - - @Override - protected void preCommit(String instantTime, HoodieCommitMetadata metadata) { + protected void preCommit(HoodieInstant inflightInstant, HoodieCommitMetadata metadata) { // Create a Hoodie table after startTxn which encapsulated the commits and files visible. 
// Important to create this after the lock to ensure latest commits show up in the timeline without need for reload HoodieTable table = createTable(config, hadoopConf); TransactionUtils.resolveWriteConflictIfAny(table, this.txnManager.getCurrentTransactionOwner(), Option.of(metadata), config, txnManager.getLastCompletedTransactionOwner()); + table.getMetadataWriter().ifPresent(w -> ((HoodieTableMetadataWriter)w).update(metadata, inflightInstant.getTimestamp(), + table.isTableServiceAction(inflightInstant.getAction()))); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java index 6e82f42411e0e..1d2b4e0edaa1a 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java @@ -18,25 +18,35 @@ package org.apache.hudi.client.bootstrap; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.Path; import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieFileStatus; import org.apache.hudi.common.bootstrap.FileStatusUtils; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.AvroOrcUtils; import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; import org.apache.parquet.schema.MessageType; import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.StructType; +import java.io.IOException; import java.util.List; import java.util.Objects; +import static org.apache.hudi.common.model.HoodieFileFormat.ORC; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; + public class HoodieSparkBootstrapSchemaProvider extends HoodieBootstrapSchemaProvider { public HoodieSparkBootstrapSchemaProvider(HoodieWriteConfig writeConfig) { super(writeConfig); @@ -44,16 +54,24 @@ public HoodieSparkBootstrapSchemaProvider(HoodieWriteConfig writeConfig) { @Override protected Schema getBootstrapSourceSchema(HoodieEngineContext context, List>> partitions) { - MessageType parquetSchema = partitions.stream().flatMap(p -> p.getValue().stream()).map(fs -> { - try { - Path filePath = FileStatusUtils.toPath(fs.getPath()); - return new ParquetUtils().readSchema(context.getHadoopConf().get(), filePath); - } catch (Exception ex) { - return null; - } - }).filter(Objects::nonNull).findAny() + Schema schema = partitions.stream().flatMap(p -> p.getValue().stream()).map(fs -> { + Path filePath = FileStatusUtils.toPath(fs.getPath()); + String extension = FSUtils.getFileExtension(filePath.getName()); + if (PARQUET.getFileExtension().equals(extension)) { + return getBootstrapSourceSchemaParquet(writeConfig, context, filePath); + } else if (ORC.getFileExtension().equals(extension)) { + return getBootstrapSourceSchemaOrc(writeConfig, context, filePath); + } else { + 
throw new HoodieException("Could not determine schema from the data files."); + } + } + ).filter(Objects::nonNull).findAny() .orElseThrow(() -> new HoodieException("Could not determine schema from the data files.")); + return schema; + } + private static Schema getBootstrapSourceSchemaParquet(HoodieWriteConfig writeConfig, HoodieEngineContext context, Path filePath) { + MessageType parquetSchema = new ParquetUtils().readSchema(context.getHadoopConf().get(), filePath); ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter( Boolean.parseBoolean(SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()), @@ -65,4 +83,19 @@ protected Schema getBootstrapSourceSchema(HoodieEngineContext context, List performClusteringWithRecordsRDD(final JavaRDD> getPartitioner(Map strategyParams, Schema schema) { - if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { + if (getWriteConfig().isLayoutOptimizationEnabled()) { + // sort input records by z-order/hilbert + return Option.of(new RDDSpatialCurveOptimizationSortPartitioner((HoodieSparkEngineContext) getEngineContext(), + getWriteConfig(), HoodieAvroUtils.addMetadataFields(schema))); + } else if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { return Option.of(new RDDCustomColumnsSortPartitioner(strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()).split(","), HoodieAvroUtils.addMetadataFields(schema))); } else { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java index d869ec77a720b..cc29ef70f5dd4 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java @@ -20,22 +20,32 @@ import org.apache.hudi.client.SparkTaskContextSupplier; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.EngineProperty; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.function.SerializableBiFunction; import org.apache.hudi.common.function.SerializableConsumer; import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; import org.apache.hudi.common.function.SerializablePairFunction; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.data.HoodieSparkLongAccumulator; import org.apache.hudi.exception.HoodieException; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFlatMapFunction; import org.apache.spark.sql.SQLContext; import scala.Tuple2; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.stream.Collectors; import java.util.stream.Stream; /** @@ -68,11 +78,57 @@ public static JavaSparkContext getSparkContext(HoodieEngineContext context) { return ((HoodieSparkEngineContext) context).getJavaSparkContext(); } + @Override + public HoodieAccumulator newAccumulator() { + HoodieSparkLongAccumulator accumulator = HoodieSparkLongAccumulator.create(); + 
javaSparkContext.sc().register(accumulator.getAccumulator()); + return accumulator; + } + + @Override + public HoodieData emptyHoodieData() { + return HoodieJavaRDD.of(javaSparkContext.emptyRDD()); + } + + @Override + public HoodieData parallelize(List data, int parallelism) { + return HoodieJavaRDD.of(javaSparkContext.parallelize(data, parallelism)); + } + @Override public List map(List data, SerializableFunction func, int parallelism) { return javaSparkContext.parallelize(data, parallelism).map(func::apply).collect(); } + @Override + public List mapToPairAndReduceByKey(List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { + return javaSparkContext.parallelize(data, parallelism).mapToPair(input -> { + Pair pair = mapToPairFunc.call(input); + return new Tuple2<>(pair.getLeft(), pair.getRight()); + }).reduceByKey(reduceFunc::apply).map(Tuple2::_2).collect(); + } + + @Override + public Stream> mapPartitionsToPairAndReduceByKey( + Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism) { + return javaSparkContext.parallelize(data.collect(Collectors.toList()), parallelism) + .mapPartitionsToPair((PairFlatMapFunction, K, V>) iterator -> + flatMapToPairFunc.call(iterator).collect(Collectors.toList()).stream() + .map(e -> new Tuple2<>(e.getKey(), e.getValue())).iterator() + ) + .reduceByKey(reduceFunc::apply) + .map(e -> new ImmutablePair<>(e._1, e._2)) + .collect().stream(); + } + + @Override + public List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism) { + return javaSparkContext.parallelize(data, parallelism).mapToPair(pair -> new Tuple2(pair.getLeft(), pair.getRight())) + .reduceByKey(reduceFunc::apply).map(Tuple2::_2).collect(); + } + @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { return javaSparkContext.parallelize(data, parallelism).flatMap(x -> func.apply(x).iterator()).collect(); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMemoryUtils.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMemoryUtils.java index 58d822a23d20b..9cb127f397b20 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMemoryUtils.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMemoryUtils.java @@ -19,7 +19,6 @@ package org.apache.hudi.client.utils; import org.apache.hudi.common.config.HoodieConfig; -import org.apache.hudi.config.HoodieIndexConfig; import org.apache.spark.storage.StorageLevel; @@ -34,12 +33,4 @@ public class SparkMemoryUtils { public static StorageLevel getWriteStatusStorageLevel(Properties properties) { return StorageLevel.fromString(new HoodieConfig(properties).getString(WRITE_STATUS_STORAGE_LEVEL_VALUE)); } - - public static StorageLevel getBloomIndexInputStorageLevel(Properties properties) { - return StorageLevel.fromString(new HoodieConfig(properties).getString(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE)); - } - - public static StorageLevel getSimpleIndexInputStorageLevel(Properties properties) { - return StorageLevel.fromString(new HoodieConfig(properties).getString(HoodieIndexConfig.SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE)); - } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaPairRDD.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaPairRDD.java new file mode 100644 
index 0000000000000..ffa1a35652c3c --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaPairRDD.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.data; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.storage.StorageLevel; + +import java.util.Map; + +import scala.Tuple2; + +/** + * Implementation of {@link HoodiePairData} using Spark {@link JavaPairRDD}. + * + * @param type of key. + * @param type of value. + */ +public class HoodieJavaPairRDD extends HoodiePairData { + + private final JavaPairRDD pairRDDData; + + private HoodieJavaPairRDD(JavaPairRDD pairRDDData) { + this.pairRDDData = pairRDDData; + } + + /** + * @param pairRDDData a {@link JavaPairRDD} of pairs. + * @param type of key. + * @param type of value. + * @return a new instance containing the {@link JavaPairRDD} reference. + */ + public static HoodieJavaPairRDD of(JavaPairRDD pairRDDData) { + return new HoodieJavaPairRDD<>(pairRDDData); + } + + /** + * @param hoodiePairData {@link HoodieJavaPairRDD } instance containing the {@link JavaPairRDD} of pairs. + * @param type of key. + * @param type of value. + * @return the {@link JavaPairRDD} of pairs. 
+ */ + public static JavaPairRDD getJavaPairRDD(HoodiePairData hoodiePairData) { + return ((HoodieJavaPairRDD) hoodiePairData).get(); + } + + @Override + public JavaPairRDD get() { + return pairRDDData; + } + + @Override + public void persist(String storageLevel) { + pairRDDData.persist(StorageLevel.fromString(storageLevel)); + } + + @Override + public void unpersist() { + pairRDDData.unpersist(); + } + + @Override + public HoodieData keys() { + return HoodieJavaRDD.of(pairRDDData.keys()); + } + + @Override + public HoodieData values() { + return HoodieJavaRDD.of(pairRDDData.values()); + } + + @Override + public long count() { + return pairRDDData.count(); + } + + @Override + public Map countByKey() { + return pairRDDData.countByKey(); + } + + @Override + public HoodieData map(SerializableFunction, O> func) { + return HoodieJavaRDD.of(pairRDDData.map( + tuple -> func.apply(new ImmutablePair<>(tuple._1, tuple._2)))); + } + + @Override + public HoodiePairData mapToPair(SerializablePairFunction, L, W> mapToPairFunc) { + return HoodieJavaPairRDD.of(pairRDDData.mapToPair(pair -> { + Pair newPair = mapToPairFunc.call(new ImmutablePair<>(pair._1, pair._2)); + return new Tuple2<>(newPair.getLeft(), newPair.getRight()); + })); + } + + @Override + public HoodiePairData>> leftOuterJoin(HoodiePairData other) { + return HoodieJavaPairRDD.of(JavaPairRDD.fromJavaRDD( + pairRDDData.leftOuterJoin(HoodieJavaPairRDD.getJavaPairRDD(other)) + .map(tuple -> new Tuple2<>(tuple._1, + new ImmutablePair<>(tuple._2._1, Option.ofNullable(tuple._2._2.orElse(null))))))); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java new file mode 100644 index 0000000000000..ceaee4728dee9 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.data; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.storage.StorageLevel; + +import java.util.Iterator; +import java.util.List; + +import scala.Tuple2; + +/** + * Holds a {@link JavaRDD} of objects. + * + * @param type of object. 
+ */ +public class HoodieJavaRDD extends HoodieData { + + private final JavaRDD rddData; + + private HoodieJavaRDD(JavaRDD rddData) { + this.rddData = rddData; + } + + /** + * @param rddData a {@link JavaRDD} of objects in type T. + * @param type of object. + * @return a new instance containing the {@link JavaRDD} reference. + */ + public static HoodieJavaRDD of(JavaRDD rddData) { + return new HoodieJavaRDD<>(rddData); + } + + /** + * @param data a {@link List} of objects in type T. + * @param context {@link HoodieSparkEngineContext} to use. + * @param parallelism parallelism for the {@link JavaRDD}. + * @param type of object. + * @return a new instance containing the {@link JavaRDD} instance. + */ + public static HoodieJavaRDD of( + List data, HoodieSparkEngineContext context, int parallelism) { + return new HoodieJavaRDD<>(context.getJavaSparkContext().parallelize(data, parallelism)); + } + + /** + * @param hoodieData {@link HoodieJavaRDD } instance containing the {@link JavaRDD} of objects. + * @param type of object. + * @return the a {@link JavaRDD} of objects in type T. + */ + public static JavaRDD getJavaRDD(HoodieData hoodieData) { + return ((HoodieJavaRDD) hoodieData).get(); + } + + @Override + public JavaRDD get() { + return rddData; + } + + @Override + public void persist(String storageLevel) { + rddData.persist(StorageLevel.fromString(storageLevel)); + } + + @Override + public void unpersist() { + rddData.unpersist(); + } + + @Override + public boolean isEmpty() { + return rddData.isEmpty(); + } + + @Override + public long count() { + return rddData.count(); + } + + @Override + public HoodieData map(SerializableFunction func) { + return HoodieJavaRDD.of(rddData.map(func::apply)); + } + + @Override + public HoodieData mapPartitions(SerializableFunction, Iterator> func, boolean preservesPartitioning) { + return HoodieJavaRDD.of(rddData.mapPartitions(func::apply, preservesPartitioning)); + } + + @Override + public HoodieData flatMap(SerializableFunction> func) { + return HoodieJavaRDD.of(rddData.flatMap(e -> func.apply(e))); + } + + @Override + public HoodiePairData mapToPair(SerializablePairFunction mapToPairFunc) { + return HoodieJavaPairRDD.of(rddData.mapToPair(input -> { + Pair pair = mapToPairFunc.call(input); + return new Tuple2<>(pair.getLeft(), pair.getRight()); + })); + } + + @Override + public HoodieData distinct() { + return HoodieJavaRDD.of(rddData.distinct()); + } + + @Override + public List collectAsList() { + return rddData.collect(); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieSparkLongAccumulator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieSparkLongAccumulator.java new file mode 100644 index 0000000000000..10027a28258c0 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieSparkLongAccumulator.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.data; + +import org.apache.hudi.common.data.HoodieAccumulator; + +import org.apache.spark.util.AccumulatorV2; +import org.apache.spark.util.LongAccumulator; + +/** + * An accumulator on counts based on Spark {@link AccumulatorV2} implementation. + */ +public class HoodieSparkLongAccumulator extends HoodieAccumulator { + + private final AccumulatorV2 accumulator; + + private HoodieSparkLongAccumulator() { + accumulator = new LongAccumulator(); + } + + public static HoodieSparkLongAccumulator create() { + return new HoodieSparkLongAccumulator(); + } + + @Override + public long value() { + return accumulator.value(); + } + + @Override + public void add(long increment) { + accumulator.add(increment); + } + + public AccumulatorV2 getAccumulator() { + return accumulator; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java new file mode 100644 index 0000000000000..fa12159eeac62 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hudi.execution.bulkinsert;
+
+import org.apache.hudi.AvroConversionUtils;
+import org.apache.hudi.HoodieSparkUtils;
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.common.config.SerializableSchema;
+import org.apache.hudi.common.model.HoodieKey;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ReflectionUtils;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.table.BulkInsertPartitioner;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.spark.ZCurveOptimizeHelper;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+/**
+ * A partitioner that does spatial curve optimization sorting based on specified column values for each RDD partition.
+ * Supports z-order curve optimization; Hilbert curve support will come soon.
+ * @param <T> HoodieRecordPayload type
+ */
+public class RDDSpatialCurveOptimizationSortPartitioner<T extends HoodieRecordPayload>
+    implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {
+  private final HoodieSparkEngineContext sparkEngineContext;
+  private final SerializableSchema serializableSchema;
+  private final HoodieWriteConfig config;
+
+  public RDDSpatialCurveOptimizationSortPartitioner(HoodieSparkEngineContext sparkEngineContext, HoodieWriteConfig config, Schema schema) {
+    this.sparkEngineContext = sparkEngineContext;
+    this.config = config;
+    this.serializableSchema = new SerializableSchema(schema);
+  }
+
+  @Override
+  public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions) {
+    String payloadClass = config.getPayloadClass();
+    // do sort
+    JavaRDD<GenericRecord> preparedRecord = prepareGenericRecord(records, outputSparkPartitions, serializableSchema.get());
+    return preparedRecord.map(record -> {
+      String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
+      String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
+      HoodieKey hoodieKey = new HoodieKey(key, partition);
+      HoodieRecordPayload avroPayload = ReflectionUtils.loadPayload(payloadClass,
+          new Object[] {Option.of(record)}, Option.class);
+      HoodieRecord hoodieRecord = new HoodieRecord(hoodieKey, avroPayload);
+      return hoodieRecord;
+    });
+  }
+
+  private JavaRDD<GenericRecord> prepareGenericRecord(JavaRDD<HoodieRecord<T>> inputRecords, final int numOutputGroups, final Schema schema) {
+    SerializableSchema serializableSchema = new SerializableSchema(schema);
+    JavaRDD<GenericRecord> genericRecordJavaRDD = inputRecords.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get());
+    Dataset<Row> originDF = AvroConversionUtils.createDataFrame(genericRecordJavaRDD.rdd(), schema.toString(), sparkEngineContext.getSqlContext().sparkSession());
+    Dataset<Row> zDataFrame;
+
+    switch (config.getLayoutOptimizationCurveBuildMethod()) {
+      case DIRECT:
+        zDataFrame = ZCurveOptimizeHelper.createZIndexedDataFrameByMapValue(originDF, config.getClusteringSortColumns(), numOutputGroups);
+        break;
+      case SAMPLE:
+        zDataFrame = ZCurveOptimizeHelper.createZIndexedDataFrameBySample(originDF, config.getClusteringSortColumns(), numOutputGroups);
+        break;
+      default:
+        throw new HoodieException("Not a valid build curve method for doWriteOperation: ");
+    }
+    return HoodieSparkUtils.createRdd(zDataFrame, schema.getName(),
+        schema.getNamespace(), false,
org.apache.hudi.common.util.Option.empty()).toJavaRDD(); + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java index 45094546b5b65..62bf5c100a949 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java @@ -7,13 +7,14 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ package org.apache.hudi.index; @@ -21,89 +22,52 @@ import org.apache.hudi.ApiMaturityLevel; import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.bloom.SparkHoodieBloomIndex; -import org.apache.hudi.index.bloom.SparkHoodieGlobalBloomIndex; -import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex; -import org.apache.hudi.index.simple.SparkHoodieGlobalSimpleIndex; -import org.apache.hudi.index.simple.SparkHoodieSimpleIndex; import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.JavaRDD; @SuppressWarnings("checkstyle:LineLength") -public abstract class SparkHoodieIndex extends HoodieIndex>, JavaRDD, JavaRDD> { +public abstract class SparkHoodieIndex> + extends HoodieIndex>, JavaRDD, JavaRDD> { protected SparkHoodieIndex(HoodieWriteConfig config) { super(config); } - public static SparkHoodieIndex createIndex(HoodieWriteConfig config) { - // first use index class config to create index. 
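For context on the removal above: the static createIndex and isGlobalIndex helpers are not deleted outright, they move into the new SparkHoodieIndexFactory introduced later in this patch. A minimal, hypothetical caller sketch of the new entry point; the table path and the standalone main() are assumptions for illustration, not part of this patch:

import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.SparkHoodieIndexFactory;

public class IndexFactoryUsageSketch {
  public static void main(String[] args) {
    // Build a write config selecting the BLOOM index type (the path is a placeholder).
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hoodie_table")
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.BLOOM)
            .build())
        .build();
    // Index construction now goes through the factory rather than the abstract index class.
    HoodieIndex index = SparkHoodieIndexFactory.createIndex(writeConfig);
    System.out.println("Global index? " + index.isGlobal());
  }
}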
- if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { - Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); - if (!(instance instanceof HoodieIndex)) { - throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); - } - return (SparkHoodieIndex) instance; - } - switch (config.getIndexType()) { - case HBASE: - return new SparkHoodieHBaseIndex<>(config); - case INMEMORY: - return new SparkInMemoryHashIndex(config); - case BLOOM: - return new SparkHoodieBloomIndex<>(config); - case GLOBAL_BLOOM: - return new SparkHoodieGlobalBloomIndex<>(config); - case SIMPLE: - return new SparkHoodieSimpleIndex(config); - case GLOBAL_SIMPLE: - return new SparkHoodieGlobalSimpleIndex(config); - default: - throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); - } - } - - /** - * Whether index is global or not. - * @param config HoodieWriteConfig to use. - * @return {@code true} if index is a global one. else {@code false}. - */ - public static boolean isGlobalIndex(HoodieWriteConfig config) { - switch (config.getIndexType()) { - case HBASE: - return true; - case INMEMORY: - return true; - case BLOOM: - return false; - case GLOBAL_BLOOM: - return true; - case SIMPLE: - return false; - case GLOBAL_SIMPLE: - return true; - default: - return createIndex(config).isGlobal(); - } - } - @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException; @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract JavaRDD> tagLocation(JavaRDD> records, HoodieEngineContext context, HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException; + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieJavaRDD.of(tagLocation(HoodieJavaRDD.getJavaRDD(records), context, hoodieTable)); + } + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieJavaRDD.of(updateLocation(HoodieJavaRDD.getJavaRDD(writeStatuses), context, hoodieTable)); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java new file mode 100644 index 0000000000000..5e686463bc98f --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.HoodieGlobalBloomIndex; +import org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper; +import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex; +import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex; +import org.apache.hudi.index.simple.HoodieGlobalSimpleIndex; +import org.apache.hudi.index.simple.HoodieSimpleIndex; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; + +import java.io.IOException; + +/** + * A factory to generate Spark {@link HoodieIndex}. + */ +public final class SparkHoodieIndexFactory { + public static HoodieIndex createIndex(HoodieWriteConfig config) { + // first use index class config to create index. + if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { + Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); + if (!(instance instanceof HoodieIndex)) { + throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); + } + return (HoodieIndex) instance; + } + switch (config.getIndexType()) { + case HBASE: + return new SparkHoodieHBaseIndex<>(config); + case INMEMORY: + return new HoodieInMemoryHashIndex<>(config); + case BLOOM: + return new HoodieBloomIndex<>(config, SparkHoodieBloomIndexHelper.getInstance()); + case GLOBAL_BLOOM: + return new HoodieGlobalBloomIndex<>(config, SparkHoodieBloomIndexHelper.getInstance()); + case SIMPLE: + return new HoodieSimpleIndex<>(config, getKeyGeneratorForSimpleIndex(config)); + case GLOBAL_SIMPLE: + return new HoodieGlobalSimpleIndex<>(config, getKeyGeneratorForSimpleIndex(config)); + default: + throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); + } + } + + /** + * Whether index is global or not. + * @param config HoodieWriteConfig to use. + * @return {@code true} if index is a global one. else {@code false}. + */ + public static boolean isGlobalIndex(HoodieWriteConfig config) { + switch (config.getIndexType()) { + case HBASE: + return true; + case INMEMORY: + return true; + case BLOOM: + return false; + case GLOBAL_BLOOM: + return true; + case SIMPLE: + return false; + case GLOBAL_SIMPLE: + return true; + default: + return createIndex(config).isGlobal(); + } + } + + private static Option getKeyGeneratorForSimpleIndex(HoodieWriteConfig config) { + try { + return config.populateMetaFields() ? 
Option.empty() + : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))); + } catch (IOException e) { + throw new HoodieIOException("KeyGenerator instantiation failed ", e); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkInMemoryHashIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkInMemoryHashIndex.java deleted file mode 100644 index 0cd839aa87be3..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkInMemoryHashIndex.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.index; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function2; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; - -/** - * Hoodie Index implementation backed by an in-memory Hash map. - *

- * ONLY USE FOR LOCAL TESTING - */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkInMemoryHashIndex extends SparkHoodieIndex { - - private static ConcurrentMap recordLocationMap; - - public SparkInMemoryHashIndex(HoodieWriteConfig config) { - super(config); - synchronized (SparkInMemoryHashIndex.class) { - if (recordLocationMap == null) { - recordLocationMap = new ConcurrentHashMap<>(); - } - } - } - - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(), true); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return writeStatusRDD.map(writeStatus -> { - for (HoodieRecord record : writeStatus.getWrittenRecords()) { - if (!writeStatus.isErrored(record.getKey())) { - HoodieKey key = record.getKey(); - Option newLocation = record.getNewLocation(); - if (newLocation.isPresent()) { - recordLocationMap.put(key, newLocation.get()); - } else { - // Delete existing index for a deleted record - recordLocationMap.remove(key); - } - } - } - return writeStatus; - }); - } - - @Override - public boolean rollbackCommit(String instantTime) { - return true; - } - - /** - * Only looks up by recordKey. - */ - @Override - public boolean isGlobal() { - return true; - } - - /** - * Mapping is available in HBase already. - */ - @Override - public boolean canIndexLogFiles() { - return true; - } - - /** - * Index needs to be explicitly updated after storage write. - */ - @Override - public boolean isImplicitWithStorage() { - return false; - } - - /** - * Function that tags each HoodieRecord with an existing location, if known. - */ - class LocationTagFunction implements Function2>, Iterator>> { - - @Override - public Iterator> call(Integer partitionNum, Iterator> hoodieRecordIterator) { - List> taggedRecords = new ArrayList<>(); - while (hoodieRecordIterator.hasNext()) { - HoodieRecord rec = hoodieRecordIterator.next(); - if (recordLocationMap.containsKey(rec.getKey())) { - rec.unseal(); - rec.setCurrentLocation(recordLocationMap.get(rec.getKey())); - rec.seal(); - } - taggedRecords.add(rec); - } - return taggedRecords.iterator(); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndex.java deleted file mode 100644 index 2dd485ebc6a05..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndex.java +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.index.bloom; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.utils.SparkMemoryUtils; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.MetadataNotFoundException; -import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.index.SparkHoodieIndex; -import org.apache.hudi.io.HoodieRangeInfoHandle; -import org.apache.hudi.table.HoodieTable; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.Partitioner; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.storage.StorageLevel; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import scala.Tuple2; - -import static java.util.stream.Collectors.groupingBy; -import static java.util.stream.Collectors.mapping; -import static java.util.stream.Collectors.toList; -import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; - -/** - * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata. - */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkHoodieBloomIndex extends SparkHoodieIndex { - - private static final Logger LOG = LogManager.getLogger(SparkHoodieBloomIndex.class); - - public SparkHoodieBloomIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - - // Step 0: cache the input record RDD - if (config.getBloomIndexUseCaching()) { - recordRDD.persist(SparkMemoryUtils.getBloomIndexInputStorageLevel(config.getProps())); - } - - // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) - JavaPairRDD partitionRecordKeyPairRDD = - recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); - - // Lookup indexes for all the partition/recordkey pair - JavaPairRDD keyFilenamePairRDD = - lookupIndex(partitionRecordKeyPairRDD, context, hoodieTable); - - // Cache the result, for subsequent stages. - if (config.getBloomIndexUseCaching()) { - keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); - } - if (LOG.isDebugEnabled()) { - long totalTaggedRecords = keyFilenamePairRDD.count(); - LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); - } - - // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys - // Cost: 4 sec. - JavaRDD> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD); - - if (config.getBloomIndexUseCaching()) { - recordRDD.unpersist(); // unpersist the input Record RDD - keyFilenamePairRDD.unpersist(); - } - return taggedRecordRDD; - } - - /** - * Lookup the location for each record key and return the pair for all record keys already - * present and drop the record keys if not present. 
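The lookup described in the javadoc above is the heart of the bloom index, and range pruning is what keeps it cheap. A self-contained sketch of the pruning predicate, simplified from the min/max record keys that BloomIndexFileInfo tracks:

public class KeyRangePruneSketch {
  // A base file stays a candidate for a record key only when the key falls
  // lexicographically inside the [minRecordKey, maxRecordKey] range stored in
  // the file's footer metadata; everything else is skipped before any bloom
  // filter check or actual key lookup runs.
  static boolean isKeyInRange(String recordKey, String minRecordKey, String maxRecordKey) {
    return minRecordKey.compareTo(recordKey) <= 0 && maxRecordKey.compareTo(recordKey) >= 0;
  }

  public static void main(String[] args) {
    System.out.println(isKeyInRange("key_150", "key_100", "key_200")); // true
    System.out.println(isKeyInRange("key_950", "key_100", "key_200")); // false
  }
}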
- */ - private JavaPairRDD lookupIndex( - JavaPairRDD partitionRecordKeyPairRDD, final HoodieEngineContext context, - final HoodieTable hoodieTable) { - // Obtain records per partition, in the incoming records - Map recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); - List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); - - // Step 2: Load all involved files as pairs - List> fileInfoList = - loadInvolvedFiles(affectedPartitionPathList, context, hoodieTable); - final Map> partitionToFileInfo = - fileInfoList.stream().collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); - - // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, - // that contains it. - JavaRDD> fileComparisonsRDD = - explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD); - Map comparisonsPerFileGroup = - computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo, fileComparisonsRDD, context); - int inputParallelism = partitionRecordKeyPairRDD.partitions().size(); - int joinParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); - LOG.info("InputParallelism: ${" + inputParallelism + "}, IndexParallelism: ${" - + config.getBloomIndexParallelism() + "}"); - return findMatchingFilesForRecordKeys(fileComparisonsRDD, joinParallelism, hoodieTable, - comparisonsPerFileGroup); - } - - /** - * Compute the estimated number of bloom filter comparisons to be performed on each file group. - */ - private Map computeComparisonsPerFileGroup(final Map recordsPerPartition, - final Map> partitionToFileInfo, - final JavaRDD> fileComparisonsRDD, - final HoodieEngineContext context) { - Map fileToComparisons; - if (config.getBloomIndexPruneByRanges()) { - // we will just try exploding the input and then count to determine comparisons - // FIX(vc): Only do sampling here and extrapolate? - context.setJobStatus(this.getClass().getSimpleName(), "Compute all comparisons needed between records and files"); - fileToComparisons = fileComparisonsRDD.mapToPair(t -> t).countByKey(); - } else { - fileToComparisons = new HashMap<>(); - partitionToFileInfo.forEach((key, value) -> { - for (BloomIndexFileInfo fileInfo : value) { - // each file needs to be compared against all the records coming into the partition - fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(key)); - } - }); - } - return fileToComparisons; - } - - /** - * Load all involved files as pair RDD. - */ - List> loadInvolvedFiles(List partitions, final HoodieEngineContext context, - final HoodieTable hoodieTable) { - - // Obtain the latest data files from all the partitions. 
- List> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream() - .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) - .collect(toList()); - - if (config.getBloomIndexPruneByRanges()) { - // also obtain file ranges, if range pruning is enabled - context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)"); - return context.map(partitionPathFileIDList, pf -> { - try { - HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); - String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); - return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); - } catch (MetadataNotFoundException me) { - LOG.warn("Unable to find range metadata in file :" + pf); - return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); - } - }, Math.max(partitionPathFileIDList.size(), 1)); - } else { - return partitionPathFileIDList.stream() - .map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); - } - } - - @Override - public boolean rollbackCommit(String instantTime) { - // Nope, don't need to do anything. - return true; - } - - /** - * This is not global, since we depend on the partitionPath to do the lookup. - */ - @Override - public boolean isGlobal() { - return false; - } - - /** - * No indexes into log files yet. - */ - @Override - public boolean canIndexLogFiles() { - return false; - } - - /** - * Bloom filters are stored, into the same data files. - */ - @Override - public boolean isImplicitWithStorage() { - return true; - } - - /** - * For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be - * checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files - * to be compared gets cut down a lot from range pruning. - *

- * Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on - * recordKey ranges in the index info. - */ - JavaRDD> explodeRecordRDDWithFileComparisons( - final Map> partitionToFileIndexInfo, - JavaPairRDD partitionRecordKeyPairRDD) { - IndexFileFilter indexFileFilter = - config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo) - : new ListBasedIndexFileFilter(partitionToFileIndexInfo); - - return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> { - String recordKey = partitionRecordKeyPair._2(); - String partitionPath = partitionRecordKeyPair._1(); - - return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() - .map(partitionFileIdPair -> new Tuple2<>(partitionFileIdPair.getRight(), - new HoodieKey(recordKey, partitionPath))) - .collect(Collectors.toList()); - }).flatMap(List::iterator); - } - - /** - * Find out pair. All workload grouped by file-level. - *

- * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD - * partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey - *

- * Make sure the parallelism is atleast the groupby parallelism for tagging location - */ - JavaPairRDD findMatchingFilesForRecordKeys( - JavaRDD> fileComparisonsRDD, - int shuffleParallelism, - HoodieTable hoodieTable, - Map fileGroupToComparisons) { - - if (config.useBloomIndexBucketizedChecking()) { - Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons, - config.getBloomIndexKeysPerBucket()); - - fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t)) - .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2); - } else { - fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism); - } - - return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true) - .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0) - .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream() - .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), - new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()))) - .collect(Collectors.toList()).iterator()); - } - - - /** - * Tag the back to the original HoodieRecord RDD. - */ - protected JavaRDD> tagLocationBacktoRecords( - JavaPairRDD keyFilenamePairRDD, JavaRDD> recordRDD) { - JavaPairRDD> keyRecordPairRDD = - recordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record)); - // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), - // so we do left outer join. - return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values() - .map(v1 -> HoodieIndexUtils.getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull()))); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return writeStatusRDD; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java new file mode 100644 index 0000000000000..bbb50d5cf6fff --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaPairRDD; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.table.HoodieTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.Partitioner; +import org.apache.spark.api.java.JavaRDD; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import scala.Tuple2; + +/** + * Helper for {@link HoodieBloomIndex} containing Spark-specific logic. + */ +public class SparkHoodieBloomIndexHelper extends BaseHoodieBloomIndexHelper { + + private static final Logger LOG = LogManager.getLogger(SparkHoodieBloomIndexHelper.class); + + private static final SparkHoodieBloomIndexHelper SINGLETON_INSTANCE = + new SparkHoodieBloomIndexHelper(); + + private SparkHoodieBloomIndexHelper() { + } + + public static SparkHoodieBloomIndexHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodiePairData findMatchingFilesForRecordKeys( + HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, + HoodiePairData partitionRecordKeyPairs, + HoodieData> fileComparisonPairs, + Map> partitionToFileInfo, + Map recordsPerPartition) { + JavaRDD> fileComparisonsRDD = + HoodieJavaRDD.getJavaRDD(fileComparisonPairs) + .map(pair -> new Tuple2<>(pair.getLeft(), pair.getRight())); + Map comparisonsPerFileGroup = computeComparisonsPerFileGroup( + config, recordsPerPartition, partitionToFileInfo, fileComparisonsRDD, context); + int inputParallelism = + HoodieJavaPairRDD.getJavaPairRDD(partitionRecordKeyPairs).partitions().size(); + int joinParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); + LOG.info("InputParallelism: ${" + inputParallelism + "}, IndexParallelism: ${" + + config.getBloomIndexParallelism() + "}"); + + if (config.useBloomIndexBucketizedChecking()) { + Partitioner partitioner = new BucketizedBloomCheckPartitioner(joinParallelism, comparisonsPerFileGroup, + config.getBloomIndexKeysPerBucket()); + + fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t)) + .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2); + } else { + fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, joinParallelism); + } + + return HoodieJavaPairRDD.of(fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true) + .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0) + .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream() + .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), + new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()))) + .collect(Collectors.toList()).iterator())); + } + + /** + * Compute the estimated number of bloom filter comparisons to be performed on each file group. 
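The estimate described above is what sizes the bucketized bloom-check partitioner. A standalone sketch of the no-range-pruning branch, with plain collections standing in for the RDD and Map shapes this class actually uses:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ComparisonEstimateSketch {
  static Map<String, Long> estimateWithoutRangePruning(Map<String, Long> recordsPerPartition,
                                                       Map<String, List<String>> partitionToFileIds) {
    Map<String, Long> fileToComparisons = new HashMap<>();
    partitionToFileIds.forEach((partition, fileIds) -> {
      for (String fileId : fileIds) {
        // Without range pruning, every file in a partition must be checked
        // against every incoming record for that partition.
        fileToComparisons.put(fileId, recordsPerPartition.get(partition));
      }
    });
    return fileToComparisons;
  }

  public static void main(String[] args) {
    Map<String, Long> recordsPerPartition = new HashMap<>();
    recordsPerPartition.put("2021/08/01", 1000L);
    Map<String, List<String>> partitionToFileIds = new HashMap<>();
    partitionToFileIds.put("2021/08/01", Arrays.asList("f1", "f2"));
    System.out.println(estimateWithoutRangePruning(recordsPerPartition, partitionToFileIds)); // {f1=1000, f2=1000}
  }
}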
+ */ + private Map computeComparisonsPerFileGroup( + final HoodieWriteConfig config, + final Map recordsPerPartition, + final Map> partitionToFileInfo, + final JavaRDD> fileComparisonsRDD, + final HoodieEngineContext context) { + Map fileToComparisons; + if (config.getBloomIndexPruneByRanges()) { + // we will just try exploding the input and then count to determine comparisons + // FIX(vc): Only do sampling here and extrapolate? + context.setJobStatus(this.getClass().getSimpleName(), "Compute all comparisons needed between records and files"); + fileToComparisons = fileComparisonsRDD.mapToPair(t -> t).countByKey(); + } else { + fileToComparisons = new HashMap<>(); + partitionToFileInfo.forEach((key, value) -> { + for (BloomIndexFileInfo fileInfo : value) { + // each file needs to be compared against all the records coming into the partition + fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(key)); + } + }); + } + return fileToComparisons; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java index 866d0d39737b6..0317b961f3d24 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.client.utils.SparkMemoryUtils; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; import org.apache.hudi.common.model.HoodieKey; @@ -35,9 +36,10 @@ import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.config.HoodieHBaseIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieDependentSystemUnavailableException; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; import org.apache.hadoop.conf.Configuration; @@ -83,7 +85,8 @@ /** * Hoodie Index implementation backed by HBase. 
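An aside on the class touched below: the HBase index looks up and writes every record key against an external HBase table, so its puts and gets are metered; note the RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS) call further down this diff. A simplified, self-contained sketch of that throttling idea; this is not Hudi's actual RateLimiter implementation:

public class ThrottleSketch {
  private final long minNanosBetweenOps;
  private long nextFreeAt;

  ThrottleSketch(int opsPerSecond) {
    this.minNanosBetweenOps = 1_000_000_000L / opsPerSecond;
    this.nextFreeAt = System.nanoTime();
  }

  // Blocks until the next operation may proceed, smoothing bursts of HBase
  // mutations down to a fixed operations-per-second budget.
  synchronized void acquire() throws InterruptedException {
    long now = System.nanoTime();
    if (now < nextFreeAt) {
      Thread.sleep((nextFreeAt - now) / 1_000_000L);
    }
    nextFreeAt = Math.max(now, nextFreeAt) + minNanosBetweenOps;
  }
}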
*/ -public class SparkHoodieHBaseIndex extends SparkHoodieIndex { +public class SparkHoodieHBaseIndex> + extends HoodieIndex>, JavaRDD, JavaRDD> { public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances"; public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled"; @@ -291,10 +294,11 @@ private Result[] doGet(HTable hTable, List keys, RateLimiter limiter) throw } @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return recordRDD.mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true); + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + return HoodieJavaRDD.of(HoodieJavaRDD.getJavaRDD(records) + .mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true)); } private Function2, Iterator> updateLocationFunction() { @@ -395,16 +399,17 @@ public Map mapFileWithInsertsToUniquePartition(JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, - JavaRDD> hoodieTable) { - final Option desiredQPSFraction = calculateQPSFraction(writeStatusRDD); + public HoodieData updateLocation( + HoodieData writeStatus, HoodieEngineContext context, + HoodieTable hoodieTable) { + JavaRDD writeStatusRDD = HoodieJavaRDD.getJavaRDD(writeStatus); + final Option desiredQPSFraction = calculateQPSFraction(writeStatusRDD); final Map fileIdPartitionMap = mapFileWithInsertsToUniquePartition(writeStatusRDD); JavaRDD partitionedRDD = this.numWriteStatusWithInserts == 0 ? writeStatusRDD : - writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w)) - .partitionBy(new WriteStatusPartitioner(fileIdPartitionMap, - this.numWriteStatusWithInserts)) - .map(w -> w._2()); + writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w)) + .partitionBy(new WriteStatusPartitioner(fileIdPartitionMap, + this.numWriteStatusWithInserts)) + .map(w -> w._2()); JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); acquireQPSResourcesAndSetBatchSize(desiredQPSFraction, jsc); JavaRDD writeStatusJavaRDD = partitionedRDD.mapPartitionsWithIndex(updateLocationFunction(), @@ -414,7 +419,7 @@ public JavaRDD updateLocation(JavaRDD writeStatusRDD, // force trigger update location(hbase puts) writeStatusJavaRDD.count(); this.hBaseIndexQPSResourceAllocator.releaseQPSResources(); - return writeStatusJavaRDD; + return HoodieJavaRDD.of(writeStatusJavaRDD); } private Option calculateQPSFraction(JavaRDD writeStatusRDD) { @@ -562,7 +567,7 @@ public boolean rollbackCommit(String instantTime) { BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) { final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS); - Long rollbackTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse(instantTime).getTime(); + Long rollbackTime = HoodieActiveTimeline.parseInstantTime(instantTime).getTime(); Long currentTime = new Date().getTime(); Scan scan = new Scan(); scan.addFamily(SYSTEM_COLUMN_FAMILY); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieSimpleIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieSimpleIndex.java deleted file mode 100644 index 5add5825c09f2..0000000000000 --- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieSimpleIndex.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.index.simple; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.utils.SparkMemoryUtils; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.index.SparkHoodieIndex; -import org.apache.hudi.io.HoodieKeyLocationFetchHandle; -import org.apache.hudi.keygen.BaseKeyGenerator; -import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.io.IOException; -import java.util.List; - -import scala.Tuple2; - -import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; - -/** - * A simple index which reads interested fields(record key and partition path) from base files and - * joins with incoming records to find the tagged location. - * - * @param - */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkHoodieSimpleIndex extends SparkHoodieIndex { - - public SparkHoodieSimpleIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return writeStatusRDD; - } - - @Override - public boolean rollbackCommit(String commitTime) { - return true; - } - - @Override - public boolean isGlobal() { - return false; - } - - @Override - public boolean canIndexLogFiles() { - return false; - } - - @Override - public boolean isImplicitWithStorage() { - return true; - } - - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return tagLocationInternal(recordRDD, context, hoodieTable); - } - - /** - * Tags records location for incoming records. 
- * - * @param inputRecordRDD {@link JavaRDD} of incoming records - * @param context instance of {@link HoodieEngineContext} to use - * @param hoodieTable instance of {@link HoodieTable} to use - * @return {@link JavaRDD} of records with record locations set - */ - protected JavaRDD> tagLocationInternal(JavaRDD> inputRecordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - if (config.getSimpleIndexUseCaching()) { - inputRecordRDD.persist(SparkMemoryUtils.getSimpleIndexInputStorageLevel(config.getProps())); - } - - JavaPairRDD> keyedInputRecordRDD = inputRecordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record)); - JavaPairRDD existingLocationsOnTable = fetchRecordLocationsForAffectedPartitions(keyedInputRecordRDD.keys(), context, hoodieTable, - config.getSimpleIndexParallelism()); - - JavaRDD> taggedRecordRDD = keyedInputRecordRDD.leftOuterJoin(existingLocationsOnTable) - .map(entry -> { - final HoodieRecord untaggedRecord = entry._2._1; - final Option location = Option.ofNullable(entry._2._2.orNull()); - return HoodieIndexUtils.getTaggedRecord(untaggedRecord, location); - }); - - if (config.getSimpleIndexUseCaching()) { - inputRecordRDD.unpersist(); - } - return taggedRecordRDD; - } - - /** - * Fetch record locations for passed in {@link HoodieKey}s. - * - * @param hoodieKeys {@link JavaRDD} of {@link HoodieKey}s for which locations are fetched - * @param context instance of {@link HoodieEngineContext} to use - * @param hoodieTable instance of {@link HoodieTable} of interest - * @param parallelism parallelism to use - * @return {@link JavaPairRDD} of {@link HoodieKey} and {@link HoodieRecordLocation} - */ - protected JavaPairRDD fetchRecordLocationsForAffectedPartitions(JavaRDD hoodieKeys, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable, - int parallelism) { - List affectedPartitionPathList = hoodieKeys.map(HoodieKey::getPartitionPath).distinct().collect(); - List> latestBaseFiles = getLatestBaseFilesForAllPartitions(affectedPartitionPathList, context, hoodieTable); - return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles); - } - - protected JavaPairRDD fetchRecordLocations(HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable, - int parallelism, - List> baseFiles) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - int fetchParallelism = Math.max(1, Math.max(baseFiles.size(), parallelism)); - - try { - Option keyGeneratorOpt = config.populateMetaFields() ? 
Option.empty() - : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))); - return jsc.parallelize(baseFiles, fetchParallelism) - .flatMapToPair(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile, keyGeneratorOpt) - .locations().map(x -> Tuple2.apply(((Pair)x).getLeft(), ((Pair)x).getRight())).iterator()); - } catch (IOException e) { - throw new HoodieIOException("KeyGenerator instantiation failed ", e); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java index 774d7c0b70e9a..8dd19d8883235 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java @@ -66,7 +66,7 @@ private static HoodieInternalRowFileWriter newParquetInternalRowFileWriter( writeConfig.getDynamicBloomFilterMaxNumEntries(), writeConfig.getBloomFilterType()); HoodieRowParquetWriteSupport writeSupport = - new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, filter); + new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, filter, writeConfig); return new HoodieInternalRowParquetWriter( path, new HoodieRowParquetConfig( writeSupport, @@ -91,7 +91,7 @@ private static HoodieInternalRowFileWriter newParquetInternalRowFileWriterWithou Path path, HoodieWriteConfig writeConfig, StructType structType, HoodieTable table) throws IOException { HoodieRowParquetWriteSupport writeSupport = - new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, null); + new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, null, writeConfig); return new HoodieInternalRowParquetWriter( path, new HoodieRowParquetConfig( writeSupport, diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java index 83ec192e523ad..f7fe50776d0ad 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java @@ -21,6 +21,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; import org.apache.spark.sql.types.StructType; @@ -42,11 +43,11 @@ public class HoodieRowParquetWriteSupport extends ParquetWriteSupport { private String minRecordKey; private String maxRecordKey; - public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, BloomFilter bloomFilter) { + public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, BloomFilter bloomFilter, HoodieWriteConfig writeConfig) { super(); Configuration hadoopConf = new Configuration(conf); - hadoopConf.set("spark.sql.parquet.writeLegacyFormat", "false"); - hadoopConf.set("spark.sql.parquet.outputTimestampType", 
"TIMESTAMP_MILLIS"); + hadoopConf.set("spark.sql.parquet.writeLegacyFormat", writeConfig.parquetWriteLegacyFormatEnabled()); + hadoopConf.set("spark.sql.parquet.outputTimestampType", writeConfig.parquetOutputTimestampType()); this.hadoopConf = hadoopConf; setSchema(structType, hadoopConf); this.bloomFilter = bloomFilter; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java index 329fdd71b5bde..5c6a0e490814b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java @@ -38,7 +38,7 @@ import scala.Option; -import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH; +import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH; import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR; import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER; import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER; @@ -104,11 +104,11 @@ public static String getPartitionPathFromRow(Row row, List partitionPath Integer fieldPos = fieldPositions.get(0); // for partition path, if field is not found, index will be set to -1 if (fieldPos == -1 || row.isNullAt(fieldPos)) { - val = DEFAULT_PARTITION_PATH; + val = HUDI_DEFAULT_PARTITION_PATH; } else { val = row.getAs(field).toString(); if (val.isEmpty()) { - val = DEFAULT_PARTITION_PATH; + val = HUDI_DEFAULT_PARTITION_PATH; } } if (hiveStylePartitioning) { @@ -117,7 +117,7 @@ public static String getPartitionPathFromRow(Row row, List partitionPath } else { // nested Object nestedVal = getNestedFieldVal(row, partitionPathPositions.get(field)); if (nestedVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER) || nestedVal.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) { - val = hiveStylePartitioning ? field + "=" + DEFAULT_PARTITION_PATH : DEFAULT_PARTITION_PATH; + val = hiveStylePartitioning ? field + "=" + HUDI_DEFAULT_PARTITION_PATH : HUDI_DEFAULT_PARTITION_PATH; } else { val = hiveStylePartitioning ? 
field + "=" + nestedVal.toString() : nestedVal.toString(); } @@ -137,11 +137,11 @@ public static String getPartitionPathFromInternalRow(InternalRow row, List positions) { if (positions.size() == 1 && positions.get(0) == -1) { - return DEFAULT_PARTITION_PATH; + return HUDI_DEFAULT_PARTITION_PATH; } int index = 0; int totalCount = positions.size(); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java index 03d1e9242e907..e3a5a3310524b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java @@ -29,7 +29,7 @@ import java.io.IOException; -import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH; +import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH; import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER; import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER; @@ -85,7 +85,7 @@ public String getPartitionPath(InternalRow internalRow, StructType structType) { private String getTimestampBasedPartitionPath(Object partitionPathFieldVal) { Object fieldVal = null; try { - if (partitionPathFieldVal == null || partitionPathFieldVal.toString().contains(DEFAULT_PARTITION_PATH) || partitionPathFieldVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER) + if (partitionPathFieldVal == null || partitionPathFieldVal.toString().contains(HUDI_DEFAULT_PARTITION_PATH) || partitionPathFieldVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER) || partitionPathFieldVal.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) { fieldVal = timestampBasedAvroKeyGenerator.getDefaultPartitionVal(); } else { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java index d4e99f7ddcd80..165b27d6ce283 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java @@ -19,14 +19,13 @@ package org.apache.hudi.keygen.factory; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieKeyGeneratorException; -import org.apache.hudi.keygen.BuiltinKeyGenerator; import org.apache.hudi.keygen.ComplexKeyGenerator; import org.apache.hudi.keygen.CustomKeyGenerator; import org.apache.hudi.keygen.GlobalDeleteKeyGenerator; -import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.keygen.NonpartitionedKeyGenerator; import org.apache.hudi.keygen.SimpleKeyGenerator; @@ -37,8 +36,9 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.util.HashMap; import java.util.Locale; -import java.util.Objects; +import java.util.Map; /** * Factory help to create {@link org.apache.hudi.keygen.KeyGenerator}. 
@@ -50,45 +50,73 @@ public class HoodieSparkKeyGeneratorFactory { private static final Logger LOG = LoggerFactory.getLogger(HoodieSparkKeyGeneratorFactory.class); - public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException { - // keyGenerator class name has higher priority - KeyGenerator keyGenerator = KeyGenUtils.createKeyGeneratorByClassName(props); + private static final Map COMMON_TO_SPARK_KEYGENERATOR = new HashMap<>(); + static { + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.ComplexAvroKeyGenerator", + "org.apache.hudi.keygen.ComplexKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.CustomAvroKeyGenerator", + "org.apache.hudi.keygen.CustomKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.GlobalAvroDeleteKeyGenerator", + "org.apache.hudi.keygen.GlobalDeleteKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator", + "org.apache.hudi.keygen.NonpartitionedKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.SimpleAvroKeyGenerator", + "org.apache.hudi.keygen.SimpleKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator", + "org.apache.hudi.keygen.TimestampBasedKeyGenerator"); + } - return Objects.isNull(keyGenerator) ? createKeyGeneratorByType(props) : keyGenerator; + public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException { + String keyGeneratorClass = getKeyGeneratorClassName(props); + try { + return (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props); + } catch (Throwable e) { + throw new IOException("Could not load key generator class " + keyGeneratorClass, e); + } } - private static BuiltinKeyGenerator createKeyGeneratorByType(TypedProperties props) throws IOException { - // Use KeyGeneratorType.SIMPLE as default keyGeneratorType - String keyGeneratorType = - props.getString(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), null); + public static String getKeyGeneratorClassName(TypedProperties props) { + String keyGeneratorClass = props.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), null); - if (StringUtils.isNullOrEmpty(keyGeneratorType)) { + if (StringUtils.isNullOrEmpty(keyGeneratorClass)) { + String keyGeneratorType = props.getString(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), KeyGeneratorType.SIMPLE.name()); LOG.info("The value of {} is empty, use SIMPLE", HoodieWriteConfig.KEYGENERATOR_TYPE.key()); - keyGeneratorType = KeyGeneratorType.SIMPLE.name(); - } - - KeyGeneratorType keyGeneratorTypeEnum; - try { - keyGeneratorTypeEnum = KeyGeneratorType.valueOf(keyGeneratorType.toUpperCase(Locale.ROOT)); - } catch (IllegalArgumentException e) { - throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGeneratorType); - } - switch (keyGeneratorTypeEnum) { - case SIMPLE: - return new SimpleKeyGenerator(props); - case COMPLEX: - return new ComplexKeyGenerator(props); - case TIMESTAMP: - return new TimestampBasedKeyGenerator(props); - case CUSTOM: - return new CustomKeyGenerator(props); - case NON_PARTITION: - return new NonpartitionedKeyGenerator(props); - case GLOBAL_DELETE: - return new GlobalDeleteKeyGenerator(props); - default: + KeyGeneratorType keyGeneratorTypeEnum; + try { + keyGeneratorTypeEnum = KeyGeneratorType.valueOf(keyGeneratorType.toUpperCase(Locale.ROOT)); + } catch (IllegalArgumentException e) { throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGeneratorType); + } + 
switch (keyGeneratorTypeEnum) { + case SIMPLE: + keyGeneratorClass = SimpleKeyGenerator.class.getName(); + break; + case COMPLEX: + keyGeneratorClass = ComplexKeyGenerator.class.getName(); + break; + case TIMESTAMP: + keyGeneratorClass = TimestampBasedKeyGenerator.class.getName(); + break; + case CUSTOM: + keyGeneratorClass = CustomKeyGenerator.class.getName(); + break; + case NON_PARTITION: + keyGeneratorClass = NonpartitionedKeyGenerator.class.getName(); + break; + case GLOBAL_DELETE: + keyGeneratorClass = GlobalDeleteKeyGenerator.class.getName(); + break; + default: + throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGeneratorType); + } } + return keyGeneratorClass; } + /** + * Convert hoodie-common KeyGenerator to SparkKeyGeneratorInterface implement. + */ + public static String convertToSparkKeyGenerator(String keyGeneratorClassName) { + return COMMON_TO_SPARK_KEYGENERATOR.getOrDefault(keyGeneratorClassName, keyGeneratorClassName); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java index 7c12a9e001024..95ab7dc79a202 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java @@ -18,26 +18,22 @@ package org.apache.hudi.metadata; +import org.apache.avro.specific.SpecificRecordBase; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.metrics.Registry; import org.apache.hudi.common.model.FileSlice; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.metrics.DistributedRegistry; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; import org.apache.hadoop.conf.Configuration; import org.apache.log4j.LogManager; @@ -47,19 +43,28 @@ import java.io.IOException; import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter { private static final Logger LOG = LogManager.getLogger(SparkHoodieBackedTableMetadataWriter.class); - public static HoodieTableMetadataWriter create(Configuration conf, HoodieWriteConfig writeConfig, HoodieEngineContext context) { - return new SparkHoodieBackedTableMetadataWriter(conf, writeConfig, context); + public static HoodieTableMetadataWriter create(Configuration conf, HoodieWriteConfig writeConfig, + HoodieEngineContext context) { + return create(conf, 
writeConfig, context, Option.empty()); } - SparkHoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteConfig writeConfig, HoodieEngineContext engineContext) { - super(hadoopConf, writeConfig, engineContext); + public static HoodieTableMetadataWriter create(Configuration conf, + HoodieWriteConfig writeConfig, + HoodieEngineContext context, + Option actionMetadata) { + return new SparkHoodieBackedTableMetadataWriter(conf, writeConfig, context, actionMetadata); + } + + SparkHoodieBackedTableMetadataWriter(Configuration hadoopConf, + HoodieWriteConfig writeConfig, + HoodieEngineContext engineContext, + Option actionMetadata) { + super(hadoopConf, writeConfig, engineContext, actionMetadata); } @Override @@ -78,7 +83,8 @@ protected void initRegistry() { } @Override - protected void initialize(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient) { + protected void initialize(HoodieEngineContext engineContext, + Option actionMetadata) { try { metrics.map(HoodieMetadataMetrics::registry).ifPresent(registry -> { if (registry instanceof DistributedRegistry) { @@ -88,7 +94,7 @@ protected void initialize(HoodieEngineContext engineContext, HoodieTableMetaClie }); if (enabled) { - bootstrapIfNeeded(engineContext, datasetMetaClient); + bootstrapIfNeeded(engineContext, dataMetaClient, actionMetadata); } } catch (IOException e) { LOG.error("Failed to initialize metadata table. Disabling the writer.", e); @@ -97,85 +103,59 @@ protected void initialize(HoodieEngineContext engineContext, HoodieTableMetaClie } @Override - protected void commit(List records, String partitionName, String instantTime) { + protected void commit(List records, String partitionName, String instantTime, boolean canTriggerTableService) { ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled"); - JavaRDD recordRDD = prepRecords(records, partitionName); + JavaRDD recordRDD = prepRecords(records, partitionName, 1); try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) { - writeClient.startCommitWithTime(instantTime); + if (!metadataMetaClient.getActiveTimeline().filterCompletedInstants().containsInstant(instantTime)) { + // if this is a new commit being applied to metadata for the first time + writeClient.startCommitWithTime(instantTime); + } else { + // this code path refers to a re-attempted commit that got committed to metadata table, but failed in datatable. + // for eg, lets say compaction c1 on 1st attempt succeeded in metadata table and failed before committing to datatable. + // when retried again, data table will first rollback pending compaction. these will be applied to metadata table, but all changes + // are upserts to metadata table and so only a new delta commit will be created. + // once rollback is complete, compaction will be retried again, which will eventually hit this code block where the respective commit is + // already part of completed commit. So, we have to manually remove the completed instant and proceed. + // and it is for the same reason we enabled withAllowMultiWriteOnSameInstant for metadata table. 
+ HoodieInstant alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get(); + HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant); + metadataMetaClient.reloadActiveTimeline(); + } List statuses = writeClient.upsertPreppedRecords(recordRDD, instantTime).collect(); statuses.forEach(writeStatus -> { if (writeStatus.hasErrors()) { throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime); } }); - // trigger cleaning, compaction, with suffixes based on the same instant time. This ensures that any future - // delta commits synced over will not have an instant time lesser than the last completed instant on the - // metadata table. - if (writeClient.scheduleCompactionAtInstant(instantTime + "001", Option.empty())) { - writeClient.compact(instantTime + "001"); + + // reload timeline + metadataMetaClient.reloadActiveTimeline(); + if (canTriggerTableService) { + compactIfNecessary(writeClient, instantTime); + doClean(writeClient, instantTime); } - writeClient.clean(instantTime + "002"); } // Update total size of the metadata and count of base/log files - metrics.ifPresent(m -> { - try { - Map stats = m.getStats(false, metaClient, metadata); - m.updateMetrics(Long.parseLong(stats.get(HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE)), - Long.parseLong(stats.get(HoodieMetadataMetrics.STAT_TOTAL_LOG_FILE_SIZE)), - Integer.parseInt(stats.get(HoodieMetadataMetrics.STAT_COUNT_BASE_FILES)), - Integer.parseInt(stats.get(HoodieMetadataMetrics.STAT_COUNT_LOG_FILES))); - } catch (HoodieIOException e) { - LOG.error("Could not publish metadata size metrics", e); - } - }); + metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata)); } /** - * Tag each record with the location. + * Tag each record with the location in the given partition. * - * Since we only read the latest base file in a partition, we tag the records with the instant time of the latest - * base file. + * The record is tagged with respective file slice's location based on its record key. 
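The branch above makes the metadata-table commit idempotent: when a compaction or rollback already completed on the metadata table but then failed on the data table, the retried attempt finds its instant time already on the completed timeline, deletes that instant file, reloads the timeline and re-applies the changes, which is safe because every change is an upsert. A minimal sketch of that decision, with the timeline reduced to a set of completed instant times (class and method names here are illustrative, not the Hudi API):

import java.util.HashSet;
import java.util.Set;

// --- sketch: re-attempted commit handling on the metadata table (illustrative) ---
class MetadataCommitSketch {
  private final Set<String> completedInstants = new HashSet<>();

  void commit(String instantTime, Runnable upsertPreppedRecords) {
    if (!completedInstants.contains(instantTime)) {
      // First attempt: start a fresh delta commit at this instant time.
      startCommit(instantTime);
    } else {
      // Retry after a partial failure on the data table: the metadata table already completed
      // this instant, so drop the completed instant and redo the upserts on a fresh attempt.
      completedInstants.remove(instantTime);   // stands in for deleting the instant file
      reloadTimeline();
    }
    upsertPreppedRecords.run();                // all changes are upserts, so replay is safe
    completedInstants.add(instantTime);
  }

  private void startCommit(String instantTime) { /* start a new delta commit */ }
  private void reloadTimeline() { /* re-read the active timeline from storage */ }
}
// --- end sketch ---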
*/ - private JavaRDD prepRecords(List records, String partitionName) { - HoodieTable table = HoodieSparkTable.create(metadataWriteConfig, engineContext); - TableFileSystemView.SliceView fsView = table.getSliceView(); - List baseFiles = fsView.getLatestFileSlices(partitionName) - .map(FileSlice::getBaseFile) - .filter(Option::isPresent) - .map(Option::get) - .collect(Collectors.toList()); - - // All the metadata fits within a single base file - if (partitionName.equals(MetadataPartitionType.FILES.partitionPath())) { - if (baseFiles.size() > 1) { - throw new HoodieMetadataException("Multiple base files found in metadata partition"); - } - } + private JavaRDD prepRecords(List records, String partitionName, int numFileGroups) { + List fileSlices = HoodieTableMetadataUtil.loadPartitionFileGroupsWithLatestFileSlices(metadataMetaClient, partitionName); + ValidationUtils.checkArgument(fileSlices.size() == numFileGroups, String.format("Invalid number of file groups: found=%d, required=%d", fileSlices.size(), numFileGroups)); JavaSparkContext jsc = ((HoodieSparkEngineContext) engineContext).getJavaSparkContext(); - String fileId; - String instantTime; - if (!baseFiles.isEmpty()) { - fileId = baseFiles.get(0).getFileId(); - instantTime = baseFiles.get(0).getCommitTime(); - } else { - // If there is a log file then we can assume that it has the data - List logFiles = fsView.getLatestFileSlices(MetadataPartitionType.FILES.partitionPath()) - .map(FileSlice::getLatestLogFile) - .filter(Option::isPresent) - .map(Option::get) - .collect(Collectors.toList()); - if (logFiles.isEmpty()) { - // No base and log files. All are new inserts - return jsc.parallelize(records, 1); - } - - fileId = logFiles.get(0).getFileId(); - instantTime = logFiles.get(0).getBaseCommitTime(); - } - - return jsc.parallelize(records, 1).map(r -> r.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId))); + return jsc.parallelize(records, 1).map(r -> { + FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), numFileGroups)); + r.setCurrentLocation(new HoodieRecordLocation(slice.getBaseInstantTime(), slice.getFileId())); + return r; + }); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java index 6a2bd6fb2f86f..0971b87c44675 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java @@ -18,12 +18,14 @@ package org.apache.hudi.table; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; @@ -33,6 +35,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import 
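prepRecords above tags every metadata record with the base instant time and file id of the file slice chosen for its key, so a given record key always routes to the same one of the partition's file groups. The patch delegates the choice to HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex; the sketch below is one plausible implementation (a rolling hash modulo the group count) and is an assumption, not necessarily the hash Hudi uses:

// --- sketch: deterministic record-key -> file-group mapping (assumed hash; the real util may differ) ---
class FileGroupMappingSketch {
  static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGroups) {
    int h = 0;
    for (int i = 0; i < recordKey.length(); i++) {
      h = 31 * h + recordKey.charAt(i);              // String.hashCode-style rolling hash
    }
    return (h & Integer.MAX_VALUE) % numFileGroups;  // non-negative and stable for a given key
  }
}
// --- end sketch ---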
org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -49,8 +52,8 @@ import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor; -import org.apache.hudi.table.action.clean.SparkCleanActionExecutor; -import org.apache.hudi.table.action.clean.SparkCleanPlanActionExecutor; +import org.apache.hudi.table.action.clean.CleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; import org.apache.hudi.table.action.cluster.SparkClusteringPlanActionExecutor; import org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor; @@ -64,18 +67,22 @@ import org.apache.hudi.table.action.commit.SparkMergeHelper; import org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkUpsertPreppedCommitActionExecutor; -import org.apache.hudi.table.action.restore.SparkCopyOnWriteRestoreActionExecutor; -import org.apache.hudi.table.action.rollback.SparkCopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; import org.apache.hudi.table.action.savepoint.SavepointActionExecutor; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.spark.ZCurveOptimizeHelper; import org.apache.spark.api.java.JavaRDD; +import scala.collection.JavaConversions; import java.io.IOException; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; /** * Implementation of a very heavily read-optimized Hoodie Table where, all data is stored in base files, with @@ -85,7 +92,8 @@ *

* UPDATES - Produce a new version of the file, just replacing the updated records with new values */ -public class HoodieSparkCopyOnWriteTable extends HoodieSparkTable { +public class HoodieSparkCopyOnWriteTable + extends HoodieSparkTable implements HoodieCompactionHandler { private static final Logger LOG = LogManager.getLogger(HoodieSparkCopyOnWriteTable.class); @@ -93,6 +101,11 @@ public HoodieSparkCopyOnWriteTable(HoodieWriteConfig config, HoodieEngineContext super(config, context, metaClient); } + @Override + public boolean isTableServiceAction(String actionType) { + return !actionType.equals(HoodieTimeline.COMMIT_ACTION); + } + @Override public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, JavaRDD> records) { return new SparkUpsertCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute(); @@ -149,13 +162,40 @@ public HoodieWriteMetadata> insertOverwriteTable(HoodieEngi return new SparkInsertOverwriteTableCommitActionExecutor(context, config, this, instantTime, records).execute(); } + @Override + public void updateStatistics(HoodieEngineContext context, List stats, String instantTime, Boolean isOptimizeOperation) { + // deal with z-order/hilbert statistic info + if (isOptimizeOperation) { + updateOptimizeOperationStatistics(context, stats, instantTime); + } + } + + private void updateOptimizeOperationStatistics(HoodieEngineContext context, List stats, String instantTime) { + String cols = config.getClusteringSortColumns(); + String basePath = metaClient.getBasePath(); + String indexPath = metaClient.getZindexPath(); + List validateCommits = metaClient.getCommitsTimeline() + .filterCompletedInstants().getInstants().map(f -> f.getTimestamp()).collect(Collectors.toList()); + List touchFiles = stats.stream().map(s -> new Path(basePath, s.getPath()).toString()).collect(Collectors.toList()); + if (touchFiles.isEmpty() || cols.isEmpty() || indexPath.isEmpty()) { + LOG.warn("save nothing to index table"); + return; + } + HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext)context; + ZCurveOptimizeHelper.saveStatisticsInfo(sparkEngineContext + .getSqlContext().sparkSession().read().load(JavaConversions.asScalaBuffer(touchFiles)), + cols, indexPath, instantTime, validateCommits); + LOG.info(String.format("save statistic info sucessfully at commitTime: %s", instantTime)); + } + @Override public Option scheduleCompaction(HoodieEngineContext context, String instantTime, Option> extraMetadata) { throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); } @Override - public HoodieWriteMetadata> compact(HoodieEngineContext context, String compactionInstantTime) { + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); } @@ -179,23 +219,32 @@ public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngine @Override public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { - new SparkCopyOnWriteRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + new CopyOnWriteRestoreActionExecutor(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); } @Override public Option scheduleCleaning(HoodieEngineContext context, String instantTime, Option> extraMetadata) { - return new SparkCleanPlanActionExecutor<>(context, config,this, 
instantTime, extraMetadata).execute(); + return new CleanPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); } - public Iterator> handleUpdate(String instantTime, String partitionPath, String fileId, + @Override + public Option scheduleRollback(HoodieEngineContext context, + String instantTime, + HoodieInstant instantToRollback, boolean skipTimelinePublish) { + return new BaseRollbackPlanActionExecutor<>(context, config, this, instantTime, instantToRollback, skipTimelinePublish).execute(); + } + + @Override + public Iterator> handleUpdate( + String instantTime, String partitionPath, String fileId, Map> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException { // these are updates HoodieMergeHandle upsertHandle = getUpdateHandle(instantTime, partitionPath, fileId, keyToNewRecords, oldDataFile); return handleUpdateInternal(upsertHandle, instantTime, fileId); } - protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, - String fileId) throws IOException { + protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, + String fileId) throws IOException { if (upsertHandle.getOldFilePath() == null) { throw new HoodieUpsertException( "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); @@ -232,22 +281,26 @@ protected HoodieMergeHandle getUpdateHandle(String instantTime, String partition } } - public Iterator> handleInsert(String instantTime, String partitionPath, String fileId, + @Override + public Iterator> handleInsert( + String instantTime, String partitionPath, String fileId, Map> recordMap) { - HoodieCreateHandle createHandle = + HoodieCreateHandle createHandle = new HoodieCreateHandle(config, instantTime, this, partitionPath, fileId, recordMap, taskContextSupplier); createHandle.write(); return Collections.singletonList(createHandle.close()).iterator(); } @Override - public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime) { - return new SparkCleanActionExecutor((HoodieSparkEngineContext)context, config, this, cleanInstantTime).execute(); + public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime, boolean skipLocking) { + return new CleanActionExecutor(context, config, this, cleanInstantTime, skipLocking).execute(); } @Override - public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, boolean deleteInstants) { - return new SparkCopyOnWriteRollbackActionExecutor((HoodieSparkEngineContext) context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute(); + public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, + boolean deleteInstants, boolean skipLocking) { + return new CopyOnWriteRollbackActionExecutor((HoodieSparkEngineContext) context, config, this, rollbackInstantTime, commitInstant, + deleteInstants, skipLocking).execute(); } @Override @@ -257,7 +310,7 @@ public HoodieSavepointMetadata savepoint(HoodieEngineContext context, String ins @Override public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { - return new SparkCopyOnWriteRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, restoreInstantTime, instantToRestore).execute(); + return new CopyOnWriteRestoreActionExecutor(context, config, this, restoreInstantTime, instantToRestore).execute(); } } 
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java index 997116ec06448..9e053aaa0da44 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java @@ -21,6 +21,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -38,8 +39,9 @@ import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.SparkBootstrapDeltaCommitActionExecutor; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; -import org.apache.hudi.table.action.compact.SparkRunCompactionActionExecutor; -import org.apache.hudi.table.action.compact.SparkScheduleCompactionActionExecutor; +import org.apache.hudi.table.action.compact.HoodieSparkMergeOnReadTableCompactor; +import org.apache.hudi.table.action.compact.RunCompactionActionExecutor; +import org.apache.hudi.table.action.compact.ScheduleCompactionActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkBulkInsertPreppedDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor; @@ -47,9 +49,10 @@ import org.apache.hudi.table.action.deltacommit.SparkInsertPreppedDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkUpsertPreppedDeltaCommitActionExecutor; -import org.apache.hudi.table.action.compact.BaseScheduleCompactionActionExecutor; -import org.apache.hudi.table.action.restore.SparkMergeOnReadRestoreActionExecutor; -import org.apache.hudi.table.action.rollback.SparkMergeOnReadRollbackActionExecutor; +import org.apache.hudi.table.action.restore.MergeOnReadRestoreActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor; + import org.apache.spark.api.java.JavaRDD; import java.util.List; @@ -77,6 +80,11 @@ public class HoodieSparkMergeOnReadTable extends super(config, context, metaClient); } + @Override + public boolean isTableServiceAction(String actionType) { + return !actionType.equals(HoodieTimeline.DELTA_COMMIT_ACTION); + } + @Override public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, JavaRDD> records) { return new SparkUpsertDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute(); @@ -120,15 +128,19 @@ public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineC @Override public Option scheduleCompaction(HoodieEngineContext context, String instantTime, Option> extraMetadata) { - BaseScheduleCompactionActionExecutor scheduleCompactionExecutor = new SparkScheduleCompactionActionExecutor( - context, config, this, instantTime, extraMetadata); + ScheduleCompactionActionExecutor scheduleCompactionExecutor = 
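Both table flavors now classify timeline actions by exclusion: on a copy-on-write table everything except a regular commit counts as a table service, while on merge-on-read everything except a delta commit does. Restated compactly with the literal timeline action strings that the constants in the patch resolve to:

// --- sketch: which timeline actions count as table services (illustrative) ---
class TableServiceActionSketch {
  static boolean isTableServiceActionCow(String actionType) {
    return !"commit".equals(actionType);        // COW writes land as "commit"
  }

  static boolean isTableServiceActionMor(String actionType) {
    return !"deltacommit".equals(actionType);   // MOR writes land as "deltacommit"
  }
}
// --- end sketch ---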
new ScheduleCompactionActionExecutor( + context, config, this, instantTime, extraMetadata, + new HoodieSparkMergeOnReadTableCompactor()); return scheduleCompactionExecutor.execute(); } @Override - public HoodieWriteMetadata> compact(HoodieEngineContext context, String compactionInstantTime) { - SparkRunCompactionActionExecutor compactionExecutor = new SparkRunCompactionActionExecutor((HoodieSparkEngineContext) context, config, this, compactionInstantTime); - return compactionExecutor.execute(); + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { + RunCompactionActionExecutor compactionExecutor = new RunCompactionActionExecutor( + context, config, this, compactionInstantTime, new HoodieSparkMergeOnReadTableCompactor(), + new HoodieSparkCopyOnWriteTable(config, context, getMetaClient())); + return convertMetadata(compactionExecutor.execute()); } @Override @@ -138,20 +150,28 @@ public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngine @Override public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { - new SparkMergeOnReadRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + new MergeOnReadRestoreActionExecutor(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + } + + @Override + public Option scheduleRollback(HoodieEngineContext context, + String instantTime, + HoodieInstant instantToRollback, boolean skipTimelinePublish) { + return new BaseRollbackPlanActionExecutor<>(context, config, this, instantTime, instantToRollback, skipTimelinePublish).execute(); } @Override public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, - boolean deleteInstants) { - return new SparkMergeOnReadRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute(); + boolean deleteInstants, + boolean skipLocking) { + return new MergeOnReadRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants, skipLocking).execute(); } @Override public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { - return new SparkMergeOnReadRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, restoreInstantTime, instantToRestore).execute(); + return new MergeOnReadRestoreActionExecutor(context, config, this, restoreInstantTime, instantToRestore).execute(); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java index 70a57b79e0f43..abbfd316741a2 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java @@ -18,8 +18,10 @@ package org.apache.hudi.table; +import org.apache.avro.specific.SpecificRecordBase; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -29,41 +31,106 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import 
org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; +import java.io.IOException; + +import static org.apache.hudi.data.HoodieJavaRDD.getJavaRDD; + public abstract class HoodieSparkTable extends HoodieTable>, JavaRDD, JavaRDD> { + private boolean isMetadataAvailabilityUpdated = false; + private boolean isMetadataTableAvailable; + protected HoodieSparkTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { super(config, context, metaClient); } public static HoodieSparkTable create(HoodieWriteConfig config, HoodieEngineContext context) { + return create(config, context, false); + } + + public static HoodieSparkTable create(HoodieWriteConfig config, HoodieEngineContext context, + boolean refreshTimeline) { HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(config.getBasePath()) .setLoadActiveTimelineOnLoad(true).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))).build(); - return HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient); + return HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient, refreshTimeline); } public static HoodieSparkTable create(HoodieWriteConfig config, HoodieSparkEngineContext context, HoodieTableMetaClient metaClient) { + return create(config, context, metaClient, false); + } + + public static HoodieSparkTable create(HoodieWriteConfig config, + HoodieSparkEngineContext context, + HoodieTableMetaClient metaClient, + boolean refreshTimeline) { + HoodieSparkTable hoodieSparkTable; switch (metaClient.getTableType()) { case COPY_ON_WRITE: - return new HoodieSparkCopyOnWriteTable<>(config, context, metaClient); + hoodieSparkTable = new HoodieSparkCopyOnWriteTable<>(config, context, metaClient); + break; case MERGE_ON_READ: - return new HoodieSparkMergeOnReadTable<>(config, context, metaClient); + hoodieSparkTable = new HoodieSparkMergeOnReadTable<>(config, context, metaClient); + break; default: throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); } + if (refreshTimeline) { + hoodieSparkTable.getHoodieView().sync(); + } + return hoodieSparkTable; + } + + public static HoodieWriteMetadata> convertMetadata( + HoodieWriteMetadata> metadata) { + return metadata.clone(getJavaRDD(metadata.getWriteStatuses())); + } + + @Override + protected HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context) { + return SparkHoodieIndexFactory.createIndex(config); } + /** + * Fetch instance of {@link HoodieTableMetadataWriter}. 
+ * + * @return instance of {@link HoodieTableMetadataWriter} + */ @Override - protected HoodieIndex>, JavaRDD, JavaRDD> getIndex(HoodieWriteConfig config, HoodieEngineContext context) { - return SparkHoodieIndex.createIndex(config); + public Option getMetadataWriter(Option actionMetadata) { + synchronized (this) { + if (!isMetadataAvailabilityUpdated) { + // This code assumes that if metadata availability is updated once it will not change. + // Please revisit this logic if that's not the case. This is done to avoid repeated calls to fs.exists(). + try { + isMetadataTableAvailable = config.isMetadataTableEnabled() + && metaClient.getFs().exists(new Path(HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath()))); + } catch (IOException e) { + throw new HoodieMetadataException("Checking existence of metadata table failed", e); + } + isMetadataAvailabilityUpdated = true; + } + } + if (isMetadataTableAvailable) { + return Option.of(SparkHoodieBackedTableMetadataWriter.create(context.getHadoopConf().get(), config, context, + actionMetadata)); + } else { + return Option.empty(); + } } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java new file mode 100644 index 0000000000000..45a0e91335321 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
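getMetadataWriter above probes the filesystem once for the metadata table's base path and caches the answer under a lock, on the stated assumption that availability does not flip after the first check; subsequent writers then skip the fs.exists() round trip. A self-contained sketch of that check-once pattern, with the probe abstracted to a Supplier (class and method names are illustrative):

import java.util.function.Supplier;

// --- sketch: check-once, cache-forever availability probe (illustrative) ---
class MetadataAvailabilitySketch {
  private boolean updated = false;
  private boolean available = false;

  boolean isMetadataTableAvailable(boolean metadataEnabled, Supplier<Boolean> tableExists) {
    synchronized (this) {
      if (!updated) {
        available = metadataEnabled && tableExists.get();  // one filesystem probe, then cached
        updated = true;
      }
      return available;
    }
  }
}
// --- end sketch ---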
+ */ + +package org.apache.hudi.table.action.bootstrap; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.bootstrap.BootstrapWriteStatus; +import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BootstrapFileMapping; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.HoodieBootstrapHandle; +import org.apache.hudi.keygen.KeyGeneratorInterface; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroReadSupport; + +import java.io.IOException; + +public abstract class BaseBootstrapMetadataHandler implements BootstrapMetadataHandler { + private static final Logger LOG = LogManager.getLogger(ParquetBootstrapMetadataHandler.class); + protected HoodieWriteConfig config; + protected HoodieTable table; + protected HoodieFileStatus srcFileStatus; + + public BaseBootstrapMetadataHandler(HoodieWriteConfig config, HoodieTable table, HoodieFileStatus srcFileStatus) { + this.config = config; + this.table = table; + this.srcFileStatus = srcFileStatus; + } + + public BootstrapWriteStatus runMetadataBootstrap(String srcPartitionPath, String partitionPath, KeyGeneratorInterface keyGenerator) { + Path sourceFilePath = FileStatusUtils.toPath(srcFileStatus.getPath()); + HoodieBootstrapHandle bootstrapHandle = new HoodieBootstrapHandle(config, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, + table, partitionPath, FSUtils.createNewFileIdPfx(), table.getTaskContextSupplier()); + try { + Schema avroSchema = getAvroSchema(sourceFilePath); + Schema recordKeySchema = HoodieAvroUtils.generateProjectionSchema(avroSchema, + keyGenerator.getRecordKeyFieldNames()); + LOG.info("Schema to be used for reading record Keys :" + recordKeySchema); + AvroReadSupport.setAvroReadSchema(table.getHadoopConf(), recordKeySchema); + AvroReadSupport.setRequestedProjection(table.getHadoopConf(), recordKeySchema); + executeBootstrap(bootstrapHandle, sourceFilePath, keyGenerator, partitionPath, avroSchema); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } + + BootstrapWriteStatus writeStatus = (BootstrapWriteStatus) bootstrapHandle.writeStatuses().get(0); + BootstrapFileMapping bootstrapFileMapping = new BootstrapFileMapping( + config.getBootstrapSourceBasePath(), srcPartitionPath, partitionPath, + srcFileStatus, writeStatus.getFileId()); + writeStatus.setBootstrapSourceFileMapping(bootstrapFileMapping); + return writeStatus; + } + + abstract Schema getAvroSchema(Path sourceFilePath) throws IOException; + + abstract void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, + Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception; +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java similarity index 54% rename from hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java rename to 
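BaseBootstrapMetadataHandler is a template method: runMetadataBootstrap owns the common flow (open a bootstrap handle, project the schema down to the record-key fields, run the copy, wrap up the write status), while subclasses contribute only getAvroSchema and executeBootstrap for their file format. A stripped-down sketch of that split, with Hudi types replaced by plain strings (every name here is a placeholder):

// --- sketch: template-method split between the common bootstrap flow and per-format steps (illustrative) ---
abstract class BootstrapHandlerSketch {
  final String runMetadataBootstrap(String sourceFile, String partitionPath) {
    String avroSchema = getAvroSchema(sourceFile);           // format-specific: read the source schema
    String keySchema = projectRecordKeyFields(avroSchema);   // common: keep only the record-key columns
    executeBootstrap(sourceFile, partitionPath, keySchema);  // format-specific: copy the keys over
    return "write-status-for-" + sourceFile;                 // stands in for BootstrapWriteStatus
  }

  private String projectRecordKeyFields(String avroSchema) { return avroSchema + "[record-key-only]"; }

  abstract String getAvroSchema(String sourceFile);
  abstract void executeBootstrap(String sourceFile, String partitionPath, String keySchema);
}
// --- end sketch ---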
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java index 5d6e57e426a0f..75daca739c8f5 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java @@ -16,21 +16,23 @@ * limitations under the License. */ -package org.apache.hudi.table.upgrade; +package org.apache.hudi.table.action.bootstrap; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieFlinkTable; -import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.client.bootstrap.BootstrapWriteStatus; +import org.apache.hudi.keygen.KeyGeneratorInterface; /** - * Downgrade handle to assist in downgrading hoodie table from version 1 to 0. + * Bootstrap metadata handler to assist in bootstrapping only metadata. */ -public class OneToZeroDowngradeHandler extends BaseOneToZeroDowngradeHandler { - - @Override - HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { - return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); - } +public interface BootstrapMetadataHandler { + /** + * Execute bootstrap with only metatata. + * @param srcPartitionPath source partition path. + * @param partitionPath destination partition path. + * @param keyGenerator key generator to use. + * @return the {@link BootstrapWriteStatus} which has the result of execution. + */ + BootstrapWriteStatus runMetadataBootstrap(String srcPartitionPath, String partitionPath, KeyGeneratorInterface keyGenerator); } + + diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/MetadataBootstrapHandlerFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/MetadataBootstrapHandlerFactory.java new file mode 100644 index 0000000000000..533e7ad27a8fa --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/MetadataBootstrapHandlerFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.bootstrap; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.avro.model.HoodieFileStatus; +import static org.apache.hudi.common.model.HoodieFileFormat.ORC; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; + +public class MetadataBootstrapHandlerFactory { + + public static BootstrapMetadataHandler getMetadataHandler(HoodieWriteConfig config, HoodieTable table, HoodieFileStatus srcFileStatus) { + Path sourceFilePath = FileStatusUtils.toPath(srcFileStatus.getPath()); + + String extension = FSUtils.getFileExtension(sourceFilePath.toString()); + BootstrapMetadataHandler bootstrapMetadataHandler; + if (ORC.getFileExtension().equals(extension)) { + return new OrcBootstrapMetadataHandler(config, table, srcFileStatus); + } else if (PARQUET.getFileExtension().equals(extension)) { + return new ParquetBootstrapMetadataHandler(config, table, srcFileStatus); + } else { + throw new HoodieIOException("Bootstrap Metadata Handler not implemented for base file format " + extension); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java new file mode 100644 index 0000000000000..9587c5b30cb74 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
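The factory above dispatches purely on the source file's extension, with ORC and Parquet as the only supported base formats and an exception for anything else. A minimal sketch of that dispatch, with the extensions written literally (in the patch they come from HoodieFileFormat):

// --- sketch: choosing a bootstrap handler from the base-file extension (illustrative) ---
class BootstrapHandlerFactorySketch {
  interface BootstrapHandler { }
  static class OrcHandler implements BootstrapHandler { }
  static class ParquetHandler implements BootstrapHandler { }

  static BootstrapHandler handlerFor(String sourceFilePath) {
    int dot = sourceFilePath.lastIndexOf('.');
    String ext = dot < 0 ? "" : sourceFilePath.substring(dot);   // e.g. ".parquet"
    switch (ext) {
      case ".orc":     return new OrcHandler();
      case ".parquet": return new ParquetHandler();
      default: throw new IllegalArgumentException(
          "Bootstrap metadata handler not implemented for base file format " + ext);
    }
  }
}
// --- end sketch ---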
+ */ + +package org.apache.hudi.table.action.bootstrap; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.bootstrap.BootstrapRecordPayload; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.AvroOrcUtils; +import org.apache.hudi.common.util.OrcReaderIterator; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.execution.SparkBoundedInMemoryExecutor; +import org.apache.hudi.io.HoodieBootstrapHandle; +import org.apache.hudi.keygen.KeyGeneratorInterface; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; + +import java.io.IOException; + +class OrcBootstrapMetadataHandler extends BaseBootstrapMetadataHandler { + private static final Logger LOG = LogManager.getLogger(OrcBootstrapMetadataHandler.class); + + public OrcBootstrapMetadataHandler(HoodieWriteConfig config, HoodieTable table, HoodieFileStatus srcFileStatus) { + super(config, table, srcFileStatus); + } + + @Override + Schema getAvroSchema(Path sourceFilePath) throws IOException { + Reader orcReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf())); + TypeDescription orcSchema = orcReader.getSchema(); + return AvroOrcUtils.createAvroSchema(orcSchema); + } + + @Override + void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, + String partitionPath, Schema avroSchema) throws Exception { + BoundedInMemoryExecutor wrapper = null; + Reader orcReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf())); + TypeDescription orcSchema = orcReader.getSchema(); + try (RecordReader reader = orcReader.rows(new Reader.Options(table.getHadoopConf()).schema(orcSchema))) { + wrapper = new SparkBoundedInMemoryExecutor(config, + new OrcReaderIterator(reader, avroSchema, orcSchema), new BootstrapRecordConsumer(bootstrapHandle), inp -> { + String recKey = keyGenerator.getKey(inp).getRecordKey(); + GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA); + gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey); + BootstrapRecordPayload payload = new BootstrapRecordPayload(gr); + HoodieRecord rec = new HoodieRecord(new HoodieKey(recKey, partitionPath), payload); + return rec; + }); + wrapper.execute(); + } catch (Exception e) { + throw new HoodieException(e); + } finally { + bootstrapHandle.close(); + if (null != wrapper) { + wrapper.shutdownNow(); + } + } + } +} + diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java new file mode 100644 index 0000000000000..058c2d4267abb --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.bootstrap; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.bootstrap.BootstrapRecordPayload; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.ParquetReaderIterator; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.execution.SparkBoundedInMemoryExecutor; +import org.apache.hudi.io.HoodieBootstrapHandle; +import org.apache.hudi.keygen.KeyGeneratorInterface; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; + +import java.io.IOException; + +class ParquetBootstrapMetadataHandler extends BaseBootstrapMetadataHandler { + private static final Logger LOG = LogManager.getLogger(ParquetBootstrapMetadataHandler.class); + + public ParquetBootstrapMetadataHandler(HoodieWriteConfig config, HoodieTable table, HoodieFileStatus srcFileStatus) { + super(config, table, srcFileStatus); + } + + @Override + Schema getAvroSchema(Path sourceFilePath) throws IOException { + ParquetMetadata readFooter = ParquetFileReader.readFooter(table.getHadoopConf(), sourceFilePath, + ParquetMetadataConverter.NO_FILTER); + MessageType parquetSchema = readFooter.getFileMetaData().getSchema(); + return new AvroSchemaConverter().convert(parquetSchema); + } + + @Override + void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, + Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception { + BoundedInMemoryExecutor wrapper = null; + try { + ParquetReader reader = + AvroParquetReader.builder(sourceFilePath).withConf(table.getHadoopConf()).build(); + wrapper = new SparkBoundedInMemoryExecutor(config, + new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> { + String recKey = keyGenerator.getKey(inp).getRecordKey(); + GenericRecord gr = new 
GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA); + gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey); + BootstrapRecordPayload payload = new BootstrapRecordPayload(gr); + HoodieRecord rec = new HoodieRecord(new HoodieKey(recKey, partitionPath), payload); + return rec; + }); + wrapper.execute(); + } catch (Exception e) { + throw new HoodieException(e); + } finally { + bootstrapHandle.close(); + if (null != wrapper) { + wrapper.shutdownNow(); + } + } + } +} + diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java index 821b3071e145c..8b353d64c4f5a 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java @@ -55,17 +55,15 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieCommitException; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieKeyGeneratorException; -import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.execution.SparkBoundedInMemoryExecutor; import org.apache.hudi.io.HoodieBootstrapHandle; import org.apache.hudi.keygen.KeyGeneratorInterface; import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; -import org.apache.hudi.metadata.HoodieTableMetadataWriter; -import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; @@ -102,6 +100,8 @@ import java.util.Map; import java.util.stream.Collectors; +import static org.apache.hudi.table.action.bootstrap.MetadataBootstrapHandlerFactory.getMetadataHandler; + public class SparkBootstrapCommitActionExecutor> extends BaseCommitActionExecutor>, JavaRDD, JavaRDD, HoodieBootstrapWriteMetadata> { @@ -185,8 +185,8 @@ private void updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, H writeStatusRDD = writeStatusRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps())); Instant indexStartTime = Instant.now(); // Update the index back - JavaRDD statuses = table.getIndex().updateLocation(writeStatusRDD, context, - table); + JavaRDD statuses = HoodieJavaRDD.getJavaRDD( + table.getIndex().updateLocation(HoodieJavaRDD.of(writeStatusRDD), context, table)); result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); result.setWriteStatuses(statuses); commitOnAutoCommit(result); @@ -226,17 +226,6 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta LOG.info("Committing metadata bootstrap !!"); } - @Override - protected void syncTableMetadata() { - // Open up the metadata table again, for syncing - try (HoodieTableMetadataWriter writer = - SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context)) { - LOG.info("Successfully synced to metadata table"); - } catch (Exception e) { - throw new HoodieMetadataException("Error syncing to metadata table.", e); - } - } - protected void commit(Option> 
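The bootstrap commit executor above now reaches the index through the engine-agnostic HoodieData abstraction: Spark call sites wrap their JavaRDD with HoodieJavaRDD.of(...) before calling updateLocation and unwrap the result with HoodieJavaRDD.getJavaRDD(...). A sketch of that wrap/unwrap shape, with the RDD simplified to a List (class names here are illustrative, not Hudi's):

import java.util.List;

// --- sketch: engine-agnostic data handle, RDD simplified to a List (illustrative) ---
abstract class HoodieDataSketch<T> {
  abstract List<T> collectAsList();
}

final class JavaListData<T> extends HoodieDataSketch<T> {
  private final List<T> data;
  private JavaListData(List<T> data) { this.data = data; }

  static <T> JavaListData<T> of(List<T> data) { return new JavaListData<>(data); }            // wrap, like HoodieJavaRDD.of
  static <T> List<T> getList(HoodieDataSketch<T> handle) { return ((JavaListData<T>) handle).data; } // unwrap, like getJavaRDD
  @Override List<T> collectAsList() { return data; }
}
// --- end sketch ---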
extraMetadata, HoodieWriteMetadata> result, List stats) { String actionType = table.getMetaClient().getCommitActionType(); LOG.info("Committing " + instantTime + ", action Type " + actionType); @@ -252,7 +241,6 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta // Finalize write finalizeWrite(instantTime, stats, result); - syncTableMetadata(); // add in extra metadata if (extraMetadata.isPresent()) { extraMetadata.get().forEach(metadata::addMetadata); @@ -260,6 +248,8 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, getSchemaToStoreInCommit()); metadata.setOperationType(operationType); + writeTableMetadata(metadata, actionType); + try { activeTimeline.saveAsComplete(new HoodieInstant(true, actionType, instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); @@ -410,8 +400,8 @@ private JavaRDD runMetadataBootstrap(List handleMetadataBootstrap(partitionFsPair.getLeft(), partitionFsPair.getRight().getLeft(), - partitionFsPair.getRight().getRight(), keyGenerator)); + .map(partitionFsPair -> getMetadataHandler(config, table, partitionFsPair.getRight().getRight()).runMetadataBootstrap(partitionFsPair.getLeft(), + partitionFsPair.getRight().getLeft(), keyGenerator)); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanActionExecutor.java deleted file mode 100644 index ba2d42f434861..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanActionExecutor.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
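With the syncTableMetadata override removed, the bootstrap commit path updates the metadata table inline: finalize the write, apply the commit metadata to the metadata table, then mark the instant complete on the data-table timeline, so the metadata table is never behind a completed commit. A sketch of that ordering (method names are placeholders for the calls in the patch):

// --- sketch: commit ordering with the inline metadata-table update (illustrative method names) ---
class CommitFlowSketch {
  void commit(String instantTime, Object commitMetadata) {
    finalizeWrite(instantTime);                   // reconcile and validate the written files
    writeTableMetadata(commitMetadata);           // apply the commit to the metadata table first ...
    saveAsComplete(instantTime, commitMetadata);  // ... then complete the instant on the data-table timeline
  }
  void finalizeWrite(String instantTime) { }
  void writeTableMetadata(Object commitMetadata) { }
  void saveAsComplete(String instantTime, Object commitMetadata) { }
}
// --- end sketch ---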
- */ - -package org.apache.hudi.table.action.clean; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.avro.model.HoodieActionInstant; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.HoodieCleanStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.CleanFileInfo; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFlatMapFunction; -import scala.Tuple2; - -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkCleanActionExecutor extends - BaseCleanActionExecutor>, JavaRDD, JavaRDD> { - - private static final Logger LOG = LogManager.getLogger(SparkCleanActionExecutor.class); - - public SparkCleanActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime) { - super(context, config, table, instantTime); - } - - private static PairFlatMapFunction>, String, PartitionCleanStat> - deleteFilesFunc(HoodieTable table) { - return (PairFlatMapFunction>, String, PartitionCleanStat>) iter -> { - Map partitionCleanStatMap = new HashMap<>(); - FileSystem fs = table.getMetaClient().getFs(); - while (iter.hasNext()) { - Tuple2 partitionDelFileTuple = iter.next(); - String partitionPath = partitionDelFileTuple._1(); - Path deletePath = new Path(partitionDelFileTuple._2().getFilePath()); - String deletePathStr = deletePath.toString(); - Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); - if (!partitionCleanStatMap.containsKey(partitionPath)) { - partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath)); - } - boolean isBootstrapBasePathFile = partitionDelFileTuple._2().isBootstrapBaseFile(); - PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath); - if (isBootstrapBasePathFile) { - // For Bootstrap Base file deletions, store the full file path. 
- partitionCleanStat.addDeleteFilePatterns(deletePath.toString(), true); - partitionCleanStat.addDeletedFileResult(deletePath.toString(), deletedFileResult, true); - } else { - partitionCleanStat.addDeleteFilePatterns(deletePath.getName(), false); - partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult, false); - } - } - return partitionCleanStatMap.entrySet().stream().map(e -> new Tuple2<>(e.getKey(), e.getValue())) - .collect(Collectors.toList()).iterator(); - }; - } - - @Override - List clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - int cleanerParallelism = Math.min( - (int) (cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()), - config.getCleanerParallelism()); - LOG.info("Using cleanerParallelism: " + cleanerParallelism); - - context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions"); - List> partitionCleanStats = jsc - .parallelize(cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream() - .flatMap(x -> x.getValue().stream().map(y -> new Tuple2<>(x.getKey(), - new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile())))) - .collect(Collectors.toList()), cleanerParallelism) - .mapPartitionsToPair(deleteFilesFunc(table)) - .reduceByKey(PartitionCleanStat::merge).collect(); - - Map partitionCleanStatsMap = partitionCleanStats.stream() - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); - - // Return PartitionCleanStat for each partition passed. - return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> { - PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath) - ? partitionCleanStatsMap.get(partitionPath) - : new PartitionCleanStat(partitionPath); - HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain(); - return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath) - .withEarliestCommitRetained(Option.ofNullable( - actionInstant != null - ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), - actionInstant.getAction(), actionInstant.getTimestamp()) - : null)) - .withDeletePathPattern(partitionCleanStat.deletePathPatterns()) - .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles()) - .withFailedDeletes(partitionCleanStat.failedDeleteFiles()) - .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns()) - .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles()) - .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles()) - .build(); - }).collect(Collectors.toList()); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanPlanActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanPlanActionExecutor.java deleted file mode 100644 index f5529a8b740a1..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanPlanActionExecutor.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
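The clean executor being removed here sized its Spark job from the work at hand, never using more parallelism than the cleaner configuration allows or than there is deletion work to do. A sketch of that cap, totaling the individual file paths per partition (the deleted code derived its figure from the per-partition lists, and the Math.max(1, ...) floor for the empty case is an added guard, not taken from the patch):

import java.util.List;
import java.util.Map;

// --- sketch: capping cleaner parallelism by the amount of delete work (illustrative) ---
class CleanerParallelismSketch {
  static int cleanerParallelism(Map<String, List<String>> filesToDeletePerPartition, int configuredParallelism) {
    int totalFiles = filesToDeletePerPartition.values().stream().mapToInt(List::size).sum();
    return Math.max(1, Math.min(totalFiles, configuredParallelism)); // never more tasks than files to delete
  }
}
// --- end sketch ---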
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.clean; - -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; - -import java.util.Map; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkCleanPlanActionExecutor extends - BaseCleanPlanActionExecutor>, JavaRDD, JavaRDD> { - - private static final Logger LOG = LogManager.getLogger(SparkCleanPlanActionExecutor.class); - - public SparkCleanPlanActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - Option> extraMetadata) { - super(context, config, table, instantTime, extraMetadata); - } - - @Override - protected Option createCleanerPlan() { - return super.execute(); - } - -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java index 2bc1f0302798e..2bcd6d787a268 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java @@ -38,19 +38,17 @@ import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieCommitException; import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.execution.SparkLazyInsertIterable; import org.apache.hudi.io.CreateHandleFactory; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieSortedMergeHandle; -import org.apache.hudi.io.storage.HoodieConcatHandle; +import org.apache.hudi.io.HoodieConcatHandle; import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; -import org.apache.hudi.metadata.HoodieTableMetadataWriter; -import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; @@ -234,12 +232,13 @@ protected JavaRDD updateIndex(JavaRDD writeStatusRDD, 
writeStatusRDD = writeStatusRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps())); Instant indexStartTime = Instant.now(); // Update the index back - JavaRDD statuses = table.getIndex().updateLocation(writeStatusRDD, context, table); + JavaRDD statuses = HoodieJavaRDD.getJavaRDD( + table.getIndex().updateLocation(HoodieJavaRDD.of(writeStatusRDD), context, table)); result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); result.setWriteStatuses(statuses); return statuses; } - + protected void updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, HoodieWriteMetadata result) { updateIndex(writeStatusRDD, result); result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(result)); @@ -264,13 +263,11 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta result.setWriteStats(writeStats); // Finalize write finalizeWrite(instantTime, writeStats, result); - syncTableMetadata(); try { - LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType()); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); - + writeTableMetadata(metadata, actionType); activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); LOG.info("Committed " + instantTime); @@ -354,17 +351,6 @@ protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, } } - @Override - public void syncTableMetadata() { - // Open up the metadata table again, for syncing - try (HoodieTableMetadataWriter writer = - SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context)) { - LOG.info("Successfully synced to metadata table"); - } catch (Exception e) { - throw new HoodieMetadataException("Error syncing to metadata table.", e); - } - } - @Override public Iterator> handleInsert(String idPfx, Iterator> recordItr) throws Exception { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java index 322d19194ae81..9013901c9a2ee 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java @@ -69,7 +69,7 @@ public HoodieWriteMetadata> bulkInsert(final JavaRDD writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), false); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java index 83ead05085015..5c3b4ca22f845 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import 
org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; @@ -95,8 +96,8 @@ public HoodieWriteMetadata> execute(String instantTime, dedupedKeys.map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload())); Instant beginTag = Instant.now(); // perform index loop up to get existing location of records - JavaRDD> taggedRecords = - table.getIndex().tagLocation(dedupedRecords, context, table); + JavaRDD> taggedRecords = HoodieJavaRDD.getJavaRDD( + table.getIndex().tagLocation(HoodieJavaRDD.of(dedupedRecords), context, table)); Duration tagLocationDuration = Duration.between(beginTag, Instant.now()); // filter out non existent keys/records diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java index 38820be534fa3..f4eff44a26f3a 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java @@ -19,10 +19,13 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.JavaRDD; @@ -47,9 +50,15 @@ public static SparkWriteHelper newInstance() { } @Override - public JavaRDD> deduplicateRecords(JavaRDD> records, - HoodieIndex>, JavaRDD, JavaRDD> index, - int parallelism) { + protected JavaRDD> tag(JavaRDD> dedupedRecords, HoodieEngineContext context, + HoodieTable>, JavaRDD, JavaRDD> table) { + return HoodieJavaRDD.getJavaRDD( + table.getIndex().tagLocation(HoodieJavaRDD.of(dedupedRecords), context, table)); + } + + @Override + public JavaRDD> deduplicateRecords( + JavaRDD> records, HoodieIndex index, int parallelism) { boolean isIndexingGlobal = index.isGlobal(); return records.mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/HoodieSparkMergeOnReadTableCompactor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/HoodieSparkMergeOnReadTableCompactor.java index 2785403bab90b..6ca4408a7bba2 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/HoodieSparkMergeOnReadTableCompactor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/HoodieSparkMergeOnReadTableCompactor.java @@ -18,248 +18,41 @@ package org.apache.hudi.table.action.compact; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.model.HoodieCompactionOperation; -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.SparkTaskContextSupplier; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.engine.HoodieEngineContext; -import 
org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.CompactionOperation; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.client.utils.SparkMemoryUtils; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.TableSchemaResolver; -import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; -import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.io.IOUtils; -import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.util.AccumulatorV2; -import org.apache.spark.util.LongAccumulator; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; -import static java.util.stream.Collectors.toList; +import org.apache.spark.api.java.JavaRDD; /** * Compacts a hoodie table with merge on read storage. 
Computes all possible compactions, * passes it through a CompactionFilter and executes all the compactions and writes a new version of base files and make * a normal commit - * */ @SuppressWarnings("checkstyle:LineLength") -public class HoodieSparkMergeOnReadTableCompactor implements HoodieCompactor>, JavaRDD, JavaRDD> { - - private static final Logger LOG = LogManager.getLogger(HoodieSparkMergeOnReadTableCompactor.class); - // Accumulator to keep track of total log files for a table - private AccumulatorV2 totalLogFiles; - // Accumulator to keep track of total log file slices for a table - private AccumulatorV2 totalFileSlices; +public class HoodieSparkMergeOnReadTableCompactor + extends HoodieCompactor>, JavaRDD, JavaRDD> { @Override - public JavaRDD compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan, - HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - if (compactionPlan == null || (compactionPlan.getOperations() == null) - || (compactionPlan.getOperations().isEmpty())) { - return jsc.emptyRDD(); - } - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); - - // Here we firstly use the table schema as the reader schema to read - // log file.That is because in the case of MergeInto, the config.getSchema may not - // the same with the table schema. - try { - Schema readerSchema = schemaUtil.getTableAvroSchema(false); - config.setSchema(readerSchema.toString()); - } catch (Exception e) { - // If there is no commit in the table, just ignore the exception. + public void preCompact( + HoodieTable table, HoodieTimeline pendingCompactionTimeline, String compactionInstantTime) { + HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); + if (!pendingCompactionTimeline.containsInstant(instant)) { + throw new IllegalStateException( + "No Compaction request available at " + compactionInstantTime + " to run compaction"); } - - // Compacting is very similar to applying updates to existing file - HoodieSparkCopyOnWriteTable table = new HoodieSparkCopyOnWriteTable(config, context, metaClient); - List operations = compactionPlan.getOperations().stream() - .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList()); - LOG.info("Compactor compacting " + operations + " files"); - - context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices"); - return jsc.parallelize(operations, operations.size()) - .map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator); - } - - private List compact(HoodieSparkCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient, - HoodieWriteConfig config, CompactionOperation operation, String instantTime) throws IOException { - FileSystem fs = metaClient.getFs(); - Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); - LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames() - + " for commit " + instantTime); - // TODO - FIX THIS - // Reads the entire avro file. Always only specific blocks should be read from the avro file - // (failure recover). 
- // Load all the delta commits since the last compaction commit and get all the blocks to be - // loaded and load it using CompositeAvroLogReader - // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. - String maxInstantTime = metaClient - .getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, - HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION)) - .filterCompletedInstants().lastInstant().get().getTimestamp(); - long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new SparkTaskContextSupplier(), config); - LOG.info("MaxMemoryPerCompaction => " + maxMemoryPerCompaction); - - List logFiles = operation.getDeltaFileNames().stream().map( - p -> new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), operation.getPartitionPath()), p).toString()) - .collect(toList()); - HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() - .withFileSystem(fs) - .withBasePath(metaClient.getBasePath()) - .withLogFilePaths(logFiles) - .withReaderSchema(readerSchema) - .withLatestInstantTime(maxInstantTime) - .withMaxMemorySizeInBytes(maxMemoryPerCompaction) - .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled()) - .withReverseReader(config.getCompactionReverseLogReadEnabled()) - .withBufferSize(config.getMaxDFSStreamBufferSize()) - .withSpillableMapBasePath(config.getSpillableMapBasePath()) - .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType()) - .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) - .build(); - if (!scanner.iterator().hasNext()) { - return new ArrayList<>(); - } - - Option oldDataFileOpt = - operation.getBaseFile(metaClient.getBasePath(), operation.getPartitionPath()); - - // Compacting is very similar to applying updates to existing file - Iterator> result; - // If the dataFile is present, perform updates else perform inserts into a new base file. 
- if (oldDataFileOpt.isPresent()) { - result = hoodieCopyOnWriteTable.handleUpdate(instantTime, operation.getPartitionPath(), - operation.getFileId(), scanner.getRecords(), - oldDataFileOpt.get()); - } else { - result = hoodieCopyOnWriteTable.handleInsert(instantTime, operation.getPartitionPath(), operation.getFileId(), - scanner.getRecords()); - } - Iterable> resultIterable = () -> result; - return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> { - s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog()); - s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles()); - s.getStat().setTotalLogRecords(scanner.getTotalLogRecords()); - s.getStat().setPartitionPath(operation.getPartitionPath()); - s.getStat() - .setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue()); - s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks()); - s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks()); - s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks()); - RuntimeStats runtimeStats = new RuntimeStats(); - runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks()); - s.getStat().setRuntimeStats(runtimeStats); - scanner.close(); - }).collect(toList()); } @Override - public HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable, - HoodieWriteConfig config, String compactionCommitTime, - Set fgIdsInPendingCompactionAndClustering) - throws IOException { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - totalLogFiles = new LongAccumulator(); - totalFileSlices = new LongAccumulator(); - jsc.sc().register(totalLogFiles); - jsc.sc().register(totalFileSlices); - - ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, - "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not " - + hoodieTable.getMetaClient().getTableType().name()); - - // TODO : check if maxMemory is not greater than JVM or spark.executor memory - // TODO - rollback any compactions in flight - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime); - List partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath()); - - // filter the partition paths if needed to reduce list status - partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths); - - if (partitionPaths.isEmpty()) { - // In case no partitions could be picked, return no compaction plan - return null; - } - - SliceView fileSystemView = hoodieTable.getSliceView(); - LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); - context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact"); - - List operations = context.flatMap(partitionPaths, partitionPath -> { - return fileSystemView - .getLatestFileSlices(partitionPath) - .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId())) - .map(s -> { - List logFiles = - s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); - totalLogFiles.add((long) logFiles.size()); - totalFileSlices.add(1L); - // Avro generated classes are not inheriting Serializable. 
Using CompactionOperation POJO - // for spark Map operations and collecting them finally in Avro generated classes for storing - // into meta files. - Option dataFile = s.getBaseFile(); - return new CompactionOperation(dataFile, partitionPath, logFiles, - config.getCompactionStrategy().captureMetrics(config, s)); - }) - .filter(c -> !c.getDeltaFileNames().isEmpty()); - }, partitionPaths.size()).stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList()); - - LOG.info("Total of " + operations.size() + " compactions are retrieved"); - LOG.info("Total number of latest files slices " + totalFileSlices.value()); - LOG.info("Total number of log files " + totalLogFiles.value()); - LOG.info("Total number of file slices " + totalFileSlices.value()); - // Filter the compactions with the passed in filter. This lets us choose most effective - // compactions only - HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations, - CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList())); - ValidationUtils.checkArgument( - compactionPlan.getOperations().stream().noneMatch( - op -> fgIdsInPendingCompactionAndClustering.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), - "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. " - + "Please fix your strategy implementation. FileIdsWithPendingCompactions :" + fgIdsInPendingCompactionAndClustering - + ", Selected workload :" + compactionPlan); - if (compactionPlan.getOperations().isEmpty()) { - LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath()); - } - return compactionPlan; + public void maybePersist(HoodieData writeStatus, HoodieWriteConfig config) { + HoodieJavaRDD.getJavaRDD(writeStatus).persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps())); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkCompactHelpers.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkCompactHelpers.java deleted file mode 100644 index 107f533f27b44..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkCompactHelpers.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.compact; - -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaRDD; - -import java.io.IOException; -import java.util.List; - -/** - * A spark implementation of {@link AbstractCompactHelpers}. - * - * @param - */ -public class SparkCompactHelpers extends - AbstractCompactHelpers>, JavaRDD, JavaRDD> { - - private SparkCompactHelpers() { - } - - private static class CompactHelperHolder { - private static final SparkCompactHelpers SPARK_COMPACT_HELPERS = new SparkCompactHelpers(); - } - - public static SparkCompactHelpers newInstance() { - return CompactHelperHolder.SPARK_COMPACT_HELPERS; - } - - @Override - public HoodieCommitMetadata createCompactionMetadata(HoodieTable>, JavaRDD, JavaRDD> table, - String compactionInstantTime, - JavaRDD writeStatuses, - String schema) throws IOException { - byte[] planBytes = table.getActiveTimeline().readCompactionPlanAsBytes( - HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get(); - HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan(planBytes); - List updateStatusMap = writeStatuses.map(WriteStatus::getStat).collect(); - org.apache.hudi.common.model.HoodieCommitMetadata metadata = new org.apache.hudi.common.model.HoodieCommitMetadata(true); - for (HoodieWriteStat stat : updateStatusMap) { - metadata.addWriteStat(stat.getPartitionPath(), stat); - } - metadata.addMetadata(org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY, schema); - if (compactionPlan.getExtraMetadata() != null) { - compactionPlan.getExtraMetadata().forEach(metadata::addMetadata); - } - return metadata; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkScheduleCompactionActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkScheduleCompactionActionExecutor.java deleted file mode 100644 index 9c44499a8f43e..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkScheduleCompactionActionExecutor.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.compact; - -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieFileGroupId; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.SyncableFileSystemView; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieCompactionException; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; - -import java.io.IOException; -import java.text.ParseException; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkScheduleCompactionActionExecutor extends - BaseScheduleCompactionActionExecutor>, JavaRDD, JavaRDD> { - - private static final Logger LOG = LogManager.getLogger(SparkScheduleCompactionActionExecutor.class); - - public SparkScheduleCompactionActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - Option> extraMetadata) { - super(context, config, table, instantTime, extraMetadata); - } - - @Override - protected HoodieCompactionPlan scheduleCompaction() { - LOG.info("Checking if compaction needs to be run on " + config.getBasePath()); - // judge if we need to compact according to num delta commits and time elapsed - boolean compactable = needCompact(config.getInlineCompactTriggerStrategy()); - if (compactable) { - LOG.info("Generating compaction plan for merge on read table " + config.getBasePath()); - HoodieSparkMergeOnReadTableCompactor compactor = new HoodieSparkMergeOnReadTableCompactor(); - try { - SyncableFileSystemView fileSystemView = (SyncableFileSystemView) table.getSliceView(); - Set fgInPendingCompactionAndClustering = fileSystemView.getPendingCompactionOperations() - .map(instantTimeOpPair -> instantTimeOpPair.getValue().getFileGroupId()) - .collect(Collectors.toSet()); - // exclude files in pending clustering from compaction. 
- fgInPendingCompactionAndClustering.addAll(fileSystemView.getFileGroupsInPendingClustering().map(Pair::getLeft).collect(Collectors.toSet())); - return compactor.generateCompactionPlan(context, table, config, instantTime, fgInPendingCompactionAndClustering); - } catch (IOException e) { - throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e); - } - } - - return new HoodieCompactionPlan(); - } - - public Pair getLatestDeltaCommitInfo(CompactionTriggerStrategy compactionTriggerStrategy) { - Option lastCompaction = table.getActiveTimeline().getCommitTimeline() - .filterCompletedInstants().lastInstant(); - HoodieTimeline deltaCommits = table.getActiveTimeline().getDeltaCommitTimeline(); - - String latestInstantTs; - int deltaCommitsSinceLastCompaction = 0; - if (lastCompaction.isPresent()) { - latestInstantTs = lastCompaction.get().getTimestamp(); - deltaCommitsSinceLastCompaction = deltaCommits.findInstantsAfter(latestInstantTs, Integer.MAX_VALUE).countInstants(); - } else { - latestInstantTs = deltaCommits.firstInstant().get().getTimestamp(); - deltaCommitsSinceLastCompaction = deltaCommits.findInstantsAfterOrEquals(latestInstantTs, Integer.MAX_VALUE).countInstants(); - } - return Pair.of(deltaCommitsSinceLastCompaction, latestInstantTs); - } - - public boolean needCompact(CompactionTriggerStrategy compactionTriggerStrategy) { - boolean compactable; - // get deltaCommitsSinceLastCompaction and lastCompactionTs - Pair latestDeltaCommitInfo = getLatestDeltaCommitInfo(compactionTriggerStrategy); - int inlineCompactDeltaCommitMax = config.getInlineCompactDeltaCommitMax(); - int inlineCompactDeltaSecondsMax = config.getInlineCompactDeltaSecondsMax(); - switch (compactionTriggerStrategy) { - case NUM_COMMITS: - compactable = inlineCompactDeltaCommitMax <= latestDeltaCommitInfo.getLeft(); - if (compactable) { - LOG.info(String.format("The delta commits >= %s, trigger compaction scheduler.", inlineCompactDeltaCommitMax)); - } - break; - case TIME_ELAPSED: - compactable = inlineCompactDeltaSecondsMax <= parsedToSeconds(instantTime) - parsedToSeconds(latestDeltaCommitInfo.getRight()); - if (compactable) { - LOG.info(String.format("The elapsed time >=%ss, trigger compaction scheduler.", inlineCompactDeltaSecondsMax)); - } - break; - case NUM_OR_TIME: - compactable = inlineCompactDeltaCommitMax <= latestDeltaCommitInfo.getLeft() - || inlineCompactDeltaSecondsMax <= parsedToSeconds(instantTime) - parsedToSeconds(latestDeltaCommitInfo.getRight()); - if (compactable) { - LOG.info(String.format("The delta commits >= %s or elapsed_time >=%ss, trigger compaction scheduler.", inlineCompactDeltaCommitMax, - inlineCompactDeltaSecondsMax)); - } - break; - case NUM_AND_TIME: - compactable = inlineCompactDeltaCommitMax <= latestDeltaCommitInfo.getLeft() - && inlineCompactDeltaSecondsMax <= parsedToSeconds(instantTime) - parsedToSeconds(latestDeltaCommitInfo.getRight()); - if (compactable) { - LOG.info(String.format("The delta commits >= %s and elapsed_time >=%ss, trigger compaction scheduler.", inlineCompactDeltaCommitMax, - inlineCompactDeltaSecondsMax)); - } - break; - default: - throw new HoodieCompactionException("Unsupported compaction trigger strategy: " + config.getInlineCompactTriggerStrategy()); - } - return compactable; - } - - public Long parsedToSeconds(String time) { - long timestamp; - try { - timestamp = HoodieActiveTimeline.COMMIT_FORMATTER.parse(time).getTime() / 1000; - } catch (ParseException e) { - throw new HoodieCompactionException(e.getMessage(), e); - 
} - return timestamp; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkCopyOnWriteRestoreActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkCopyOnWriteRestoreActionExecutor.java deleted file mode 100644 index 101b3217da99e..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkCopyOnWriteRestoreActionExecutor.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.restore; - -import org.apache.hudi.avro.model.HoodieRollbackMetadata; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.rollback.SparkCopyOnWriteRollbackActionExecutor; - -import org.apache.spark.api.java.JavaRDD; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkCopyOnWriteRestoreActionExecutor extends - BaseRestoreActionExecutor>, JavaRDD, JavaRDD> { - - public SparkCopyOnWriteRestoreActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - String restoreInstantTime) { - super(context, config, table, instantTime, restoreInstantTime); - } - - @Override - protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) { - table.getMetaClient().reloadActiveTimeline(); - SparkCopyOnWriteRollbackActionExecutor rollbackActionExecutor = new SparkCopyOnWriteRollbackActionExecutor( - (HoodieSparkEngineContext) context, - config, - table, - HoodieActiveTimeline.createNewInstantTime(), - instantToRollback, - true, - true, - false); - if (!instantToRollback.getAction().equals(HoodieTimeline.COMMIT_ACTION) - && !instantToRollback.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { - throw new HoodieRollbackException("Unsupported action in rollback instant:" + instantToRollback); - } - return rollbackActionExecutor.execute(); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java 
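
Aside (not part of the patch): the deleted SparkScheduleCompactionActionExecutor above decides whether to schedule a compaction by comparing delta-commit count and elapsed time against the configured maxima, combined according to the trigger strategy. The following standalone sketch restates that decision logic with plain parameters instead of HoodieWriteConfig and timeline lookups; the enum and class names here are local to the sketch, not Hudi APIs.

```java
// Illustrative sketch only: distills the needCompact(...) switch from the deleted
// SparkScheduleCompactionActionExecutor. Real code pulls the thresholds from
// HoodieWriteConfig and derives the counts from the active timeline.
enum TriggerStrategy { NUM_COMMITS, TIME_ELAPSED, NUM_OR_TIME, NUM_AND_TIME }

final class CompactionTriggerSketch {
  static boolean needCompact(TriggerStrategy strategy,
                             int deltaCommitsSinceLastCompaction,
                             long secondsSinceLastCompaction,
                             int maxDeltaCommits,
                             long maxDeltaSeconds) {
    boolean enoughCommits = deltaCommitsSinceLastCompaction >= maxDeltaCommits;
    boolean enoughTime = secondsSinceLastCompaction >= maxDeltaSeconds;
    switch (strategy) {
      case NUM_COMMITS:  return enoughCommits;
      case TIME_ELAPSED: return enoughTime;
      case NUM_OR_TIME:  return enoughCommits || enoughTime;
      case NUM_AND_TIME: return enoughCommits && enoughTime;
      default: throw new IllegalArgumentException("Unsupported strategy: " + strategy);
    }
  }

  public static void main(String[] args) {
    // 5 delta commits and 10 minutes elapsed, with thresholds of 4 commits / 1 hour:
    // NUM_OR_TIME triggers because the commit threshold is already met.
    System.out.println(needCompact(TriggerStrategy.NUM_OR_TIME, 5, 600, 4, 3600)); // true
  }
}
```
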
deleted file mode 100644 index fcb3882b7bf5e..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock.HoodieCommandBlockTypeEnum; -import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieRollbackException; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.PathFilter; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; - -import scala.Tuple2; - -/** - * Performs Rollback of Hoodie Tables. - */ -public class ListingBasedRollbackHelper implements Serializable { - - private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackHelper.class); - - private final HoodieTableMetaClient metaClient; - private final HoodieWriteConfig config; - - public ListingBasedRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { - this.metaClient = metaClient; - this.config = config; - } - - /** - * Performs all rollback actions that we have collected in parallel. 
- */ - public List performRollback(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1); - context.setJobStatus(this.getClass().getSimpleName(), "Perform rollback actions"); - JavaPairRDD partitionPathRollbackStatsPairRDD = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, sparkPartitions, true); - return partitionPathRollbackStatsPairRDD.reduceByKey(RollbackUtils::mergeRollbackStat).map(Tuple2::_2).collect(); - } - - /** - * Collect all file info that needs to be rollbacked. - */ - public List collectRollbackStats(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1); - context.setJobStatus(this.getClass().getSimpleName(), "Collect rollback stats for upgrade/downgrade"); - JavaPairRDD partitionPathRollbackStatsPairRDD = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, sparkPartitions, false); - return partitionPathRollbackStatsPairRDD.map(Tuple2::_2).collect(); - } - - /** - * May be delete interested files and collect stats or collect stats only. - * - * @param context instance of {@link HoodieEngineContext} to use. - * @param instantToRollback {@link HoodieInstant} of interest for which deletion or collect stats is requested. - * @param rollbackRequests List of {@link ListingBasedRollbackRequest} to be operated on. - * @param sparkPartitions number of spark partitions to use for parallelism. - * @param doDelete {@code true} if deletion has to be done. {@code false} if only stats are to be collected w/o performing any deletes. - * @return stats collected with or w/o actual deletions. 
- */ - JavaPairRDD maybeDeleteAndCollectStats(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests, - int sparkPartitions, boolean doDelete) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - return jsc.parallelize(rollbackRequests, sparkPartitions).mapToPair(rollbackRequest -> { - switch (rollbackRequest.getType()) { - case DELETE_DATA_FILES_ONLY: { - final Map filesToDeletedStatus = deleteBaseFiles(metaClient, config, instantToRollback.getTimestamp(), - rollbackRequest.getPartitionPath(), doDelete); - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case DELETE_DATA_AND_LOG_FILES: { - final Map filesToDeletedStatus = deleteBaseAndLogFiles(metaClient, config, instantToRollback.getTimestamp(), rollbackRequest.getPartitionPath(), doDelete); - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case APPEND_ROLLBACK_BLOCK: { - String fileId = rollbackRequest.getFileId().get(); - String latestBaseInstant = rollbackRequest.getLatestBaseInstant().get(); - - // collect all log files that is supposed to be deleted with this rollback - Map writtenLogFileSizeMap = FSUtils.getAllLogFiles(metaClient.getFs(), - FSUtils.getPartitionPath(config.getBasePath(), rollbackRequest.getPartitionPath()), - fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), latestBaseInstant) - .collect(Collectors.toMap(HoodieLogFile::getFileStatus, value -> value.getFileStatus().getLen())); - - Writer writer = null; - try { - writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath())) - .withFileId(fileId) - .overBaseCommit(latestBaseInstant) - .withFs(metaClient.getFs()) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - - // generate metadata - if (doDelete) { - Map header = generateHeader(instantToRollback.getTimestamp()); - // if update belongs to an existing log file - writer.appendBlock(new HoodieCommandBlock(header)); - } - } catch (IOException | InterruptedException io) { - throw new HoodieRollbackException("Failed to rollback for instant " + instantToRollback, io); - } finally { - try { - if (writer != null) { - writer.close(); - } - } catch (IOException io) { - throw new HoodieIOException("Error appending rollback block..", io); - } - } - - // This step is intentionally done after writer is closed. Guarantees that - // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in - // cloud-storage : HUDI-168 - Map filesToNumBlocksRollback = Collections.singletonMap( - metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()), - 1L - ); - - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withRollbackBlockAppendResults(filesToNumBlocksRollback) - .withWrittenLogFileSizeMap(writtenLogFileSizeMap).build()); - } - default: - throw new IllegalStateException("Unknown Rollback action " + rollbackRequest); - } - }); - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. 
- */ - private Map deleteBaseAndLogFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - LOG.info("Cleaning path " + partitionPath); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - SerializablePathFilter filter = (path) -> { - if (path.toString().endsWith(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } else if (FSUtils.isLogFile(path)) { - // Since the baseCommitTime is the only commit for new log files, it's okay here - String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); - return commit.equals(fileCommitTime); - } - return false; - }; - - final Map results = new HashMap<>(); - FileSystem fs = metaClient.getFs(); - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. - */ - private Map deleteBaseFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - final Map results = new HashMap<>(); - LOG.info("Cleaning path " + partitionPath); - FileSystem fs = metaClient.getFs(); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - PathFilter filter = (path) -> { - if (path.toString().contains(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } - return false; - }; - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - private Map generateHeader(String commit) { - // generate metadata - Map header = new HashMap<>(3); - header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); - header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit); - header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, - String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); - return header; - } - - public interface SerializablePathFilter extends PathFilter, Serializable { - - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkCopyOnWriteRollbackActionExecutor.java deleted file mode 100644 index 611ec217a7759..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkCopyOnWriteRollbackActionExecutor.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaRDD; - -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkCopyOnWriteRollbackActionExecutor extends - BaseCopyOnWriteRollbackActionExecutor>, JavaRDD, JavaRDD> { - public SparkCopyOnWriteRollbackActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public SparkCopyOnWriteRollbackActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() { - if (useMarkerBasedStrategy) { - return new SparkMarkerBasedRollbackStrategy(table, context, config, instantTime); - } else { - return this::executeRollbackUsingFileListing; - } - } - - @Override - protected List executeRollbackUsingFileListing(HoodieInstant instantToRollback) { - List rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context, - table.getMetaClient().getBasePath(), config); - return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMarkerBasedRollbackStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMarkerBasedRollbackStrategy.java deleted file mode 100644 index 0adacd28cd9ec..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMarkerBasedRollbackStrategy.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.marker.MarkerBasedRollbackUtils; -import org.apache.hudi.table.marker.WriteMarkers; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import scala.Tuple2; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkMarkerBasedRollbackStrategy extends AbstractMarkerBasedRollbackStrategy>, JavaRDD, JavaRDD> { - public SparkMarkerBasedRollbackStrategy(HoodieTable>, JavaRDD, JavaRDD> table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) { - super(table, context, config, instantTime); - } - - @Override - public List execute(HoodieInstant instantToRollback) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - try { - List markerPaths = MarkerBasedRollbackUtils.getAllMarkerPaths( - table, context, instantToRollback.getTimestamp(), config.getRollbackParallelism()); - int parallelism = Math.max(Math.min(markerPaths.size(), config.getRollbackParallelism()), 1); - jsc.setJobGroup(this.getClass().getSimpleName(), "Rolling back using marker files"); - return jsc.parallelize(markerPaths, parallelism) - .map(markerFilePath -> { - String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1); - IOType type = IOType.valueOf(typeStr); - switch (type) { - case MERGE: - return undoMerge(WriteMarkers.stripMarkerSuffix(markerFilePath)); - case APPEND: - return undoAppend(WriteMarkers.stripMarkerSuffix(markerFilePath), instantToRollback); - case CREATE: - return undoCreate(WriteMarkers.stripMarkerSuffix(markerFilePath)); - default: - throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback); - } - }) - .mapToPair(rollbackStat -> new Tuple2<>(rollbackStat.getPartitionPath(), rollbackStat)) - .reduceByKey(RollbackUtils::mergeRollbackStat) - .map(Tuple2::_2).collect(); - } catch (Exception e) { - throw new HoodieRollbackException("Error rolling back using marker files 
written for " + instantToRollback, e); - } - } - - protected Map getWrittenLogFileSizeMap(String partitionPathStr, String baseCommitTime, String fileId) throws IOException { - // collect all log files that is supposed to be deleted with this rollback - return FSUtils.getAllLogFiles(table.getMetaClient().getFs(), - FSUtils.getPartitionPath(config.getBasePath(), partitionPathStr), fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), baseCommitTime) - .collect(Collectors.toMap(HoodieLogFile::getFileStatus, value -> value.getFileStatus().getLen())); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMergeOnReadRollbackActionExecutor.java deleted file mode 100644 index 9486362558147..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMergeOnReadRollbackActionExecutor.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.io.IOException; -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkMergeOnReadRollbackActionExecutor extends - BaseMergeOnReadRollbackActionExecutor>, JavaRDD, JavaRDD> { - public SparkMergeOnReadRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public SparkMergeOnReadRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() { - if (useMarkerBasedStrategy) { - return new SparkMarkerBasedRollbackStrategy(table, context, config, instantTime); - } else { - return this::executeRollbackUsingFileListing; - } - } - - @Override - protected List executeRollbackUsingFileListing(HoodieInstant resolvedInstant) { - List rollbackRequests; - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - try { - rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(resolvedInstant, table, context); - } catch (IOException e) { - throw new HoodieIOException("Error generating rollback requests by file listing.", e); - } - return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, resolvedInstant, rollbackRequests); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngrade.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngrade.java deleted file mode 100644 index 7284db5df4293..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngrade.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.common.config.ConfigProperty; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.HoodieTableVersion; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieUpgradeDowngradeException; - -import java.io.IOException; -import java.util.Map; - -public class SparkUpgradeDowngrade extends AbstractUpgradeDowngrade { - - public SparkUpgradeDowngrade(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context) { - super(metaClient, config, context); - } - - @Override - public void run(HoodieTableMetaClient metaClient, - HoodieTableVersion toVersion, - HoodieWriteConfig config, - HoodieEngineContext context, - String instantTime) { - try { - new SparkUpgradeDowngrade(metaClient, config, context).run(toVersion, instantTime); - } catch (IOException e) { - throw new HoodieUpgradeDowngradeException("Error during upgrade/downgrade to version:" + toVersion, e); - } - } - - @Override - protected Map upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { - if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) { - return new ZeroToOneUpgradeHandler().upgrade(config, context, instantTime); - } else if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.TWO) { - return new OneToTwoUpgradeHandler().upgrade(config, context, instantTime); - } else { - throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), true); - } - } - - @Override - protected Map downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { - if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.ZERO) { - return new OneToZeroDowngradeHandler().downgrade(config, context, instantTime); - } else if (fromVersion == HoodieTableVersion.TWO && toVersion == HoodieTableVersion.ONE) { - return new TwoToOneDowngradeHandler().downgrade(config, context, instantTime); - } else { - throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), false); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java new file mode 100644 index 0000000000000..f943b701757ed --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; + +/** + * Spark upgrade and downgrade helper. + */ +public class SparkUpgradeDowngradeHelper implements BaseUpgradeDowngradeHelper { + + private static final SparkUpgradeDowngradeHelper SINGLETON_INSTANCE = + new SparkUpgradeDowngradeHelper(); + + private SparkUpgradeDowngradeHelper() { + } + + public static SparkUpgradeDowngradeHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { + return HoodieSparkTable.create(config, context); + } + + @Override + public String getPartitionColumns(HoodieWriteConfig config) { + return HoodieSparkUtils.getPartitionColumns(config.getProps()); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java deleted file mode 100644 index 7bf7209c037b3..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.rollback.ListingBasedRollbackHelper; -import org.apache.hudi.table.action.rollback.ListingBasedRollbackRequest; - -import java.util.List; - -/** - * Upgrade handle to assist in upgrading hoodie table from version 0 to 1. 
- */ -public class ZeroToOneUpgradeHandler extends BaseZeroToOneUpgradeHandler { - - @Override - HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { - return HoodieSparkTable.create(config, context); - } - - @Override - List getListBasedRollBackStats(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context, Option commitInstantOpt, - List rollbackRequests) { - return new ListingBasedRollbackHelper(metaClient, config) - .collectRollbackStats(context, commitInstantOpt.get(), rollbackRequests); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java new file mode 100644 index 0000000000000..7ba1c9465bfd0 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java @@ -0,0 +1,355 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark; + +import scala.collection.JavaConversions; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.HoodieSparkUtils$; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.optimize.ZOrderingUtil; +import org.apache.parquet.io.api.Binary; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.Row$; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.hudi.execution.RangeSampleSort$; +import org.apache.spark.sql.hudi.execution.ZorderingBinarySort; +import org.apache.spark.sql.types.BinaryType; +import org.apache.spark.sql.types.BinaryType$; +import org.apache.spark.sql.types.BooleanType; +import org.apache.spark.sql.types.ByteType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.DoubleType; +import org.apache.spark.sql.types.FloatType; +import org.apache.spark.sql.types.IntegerType; +import org.apache.spark.sql.types.LongType; +import org.apache.spark.sql.types.LongType$; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.ShortType; +import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.StringType$; 
+import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType$; +import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.util.SerializableConfiguration; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class ZCurveOptimizeHelper { + + private static final String SPARK_JOB_DESCRIPTION = "spark.job.description"; + + /** + * Create z-order DataFrame directly + * first, map all base type data to byte[8], then create z-order DataFrame + * only support base type data. long,int,short,double,float,string,timestamp,decimal,date,byte + * this method is more effective than createZIndexDataFrameBySample + * + * @param df a spark DataFrame holds parquet files to be read. + * @param zCols z-sort cols + * @param fileNum spark partition num + * @return a dataFrame sorted by z-order. + */ + public static Dataset createZIndexedDataFrameByMapValue(Dataset df, List zCols, int fileNum) { + Map columnsMap = Arrays.stream(df.schema().fields()).collect(Collectors.toMap(e -> e.name(), e -> e)); + int fieldNum = df.schema().fields().length; + List checkCols = zCols.stream().filter(f -> columnsMap.containsKey(f)).collect(Collectors.toList()); + if (zCols.size() != checkCols.size()) { + return df; + } + // only one col to sort, no need to use z-order + if (zCols.size() == 1) { + return df.repartitionByRange(fieldNum, org.apache.spark.sql.functions.col(zCols.get(0))); + } + Map fieldMap = zCols + .stream().collect(Collectors.toMap(e -> Arrays.asList(df.schema().fields()).indexOf(columnsMap.get(e)), e -> columnsMap.get(e))); + // z-sort + JavaRDD sortedRdd = df.toJavaRDD().map(row -> { + List zBytesList = fieldMap.entrySet().stream().map(entry -> { + int index = entry.getKey(); + StructField field = entry.getValue(); + DataType dataType = field.dataType(); + if (dataType instanceof LongType) { + return ZOrderingUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getLong(index)); + } else if (dataType instanceof DoubleType) { + return ZOrderingUtil.doubleTo8Byte(row.isNullAt(index) ? Double.MAX_VALUE : row.getDouble(index)); + } else if (dataType instanceof IntegerType) { + return ZOrderingUtil.intTo8Byte(row.isNullAt(index) ? Integer.MAX_VALUE : row.getInt(index)); + } else if (dataType instanceof FloatType) { + return ZOrderingUtil.doubleTo8Byte(row.isNullAt(index) ? Float.MAX_VALUE : row.getFloat(index)); + } else if (dataType instanceof StringType) { + return ZOrderingUtil.utf8To8Byte(row.isNullAt(index) ? "" : row.getString(index)); + } else if (dataType instanceof DateType) { + return ZOrderingUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getDate(index).getTime()); + } else if (dataType instanceof TimestampType) { + return ZOrderingUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getTimestamp(index).getTime()); + } else if (dataType instanceof ByteType) { + return ZOrderingUtil.byteTo8Byte(row.isNullAt(index) ? Byte.MAX_VALUE : row.getByte(index)); + } else if (dataType instanceof ShortType) { + return ZOrderingUtil.intTo8Byte(row.isNullAt(index) ? Short.MAX_VALUE : row.getShort(index)); + } else if (dataType instanceof DecimalType) { + return ZOrderingUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getDecimal(index).longValue()); + } else if (dataType instanceof BooleanType) { + boolean value = row.isNullAt(index) ? 
false : row.getBoolean(index); + return ZOrderingUtil.intTo8Byte(value ? 1 : 0); + } else if (dataType instanceof BinaryType) { + return ZOrderingUtil.paddingTo8Byte(row.isNullAt(index) ? new byte[] {0} : (byte[]) row.get(index)); + } + return null; + }).filter(f -> f != null).collect(Collectors.toList()); + byte[][] zBytes = new byte[zBytesList.size()][]; + for (int i = 0; i < zBytesList.size(); i++) { + zBytes[i] = zBytesList.get(i); + } + List zVaules = new ArrayList<>(); + zVaules.addAll(scala.collection.JavaConverters.bufferAsJavaListConverter(row.toSeq().toBuffer()).asJava()); + zVaules.add(ZOrderingUtil.interleaving(zBytes, 8)); + return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(zVaules)); + }).sortBy(f -> new ZorderingBinarySort((byte[]) f.get(fieldNum)), true, fileNum); + + // create new StructType + List newFields = new ArrayList<>(); + newFields.addAll(Arrays.asList(df.schema().fields())); + newFields.add(new StructField("zIndex", BinaryType$.MODULE$, true, Metadata.empty())); + + // create new DataFrame + return df.sparkSession().createDataFrame(sortedRdd, StructType$.MODULE$.apply(newFields)).drop("zIndex"); + } + + public static Dataset createZIndexedDataFrameByMapValue(Dataset df, String zCols, int fileNum) { + if (zCols == null || zCols.isEmpty() || fileNum <= 0) { + return df; + } + return createZIndexedDataFrameByMapValue(df, + Arrays.stream(zCols.split(",")).map(f -> f.trim()).collect(Collectors.toList()), fileNum); + } + + public static Dataset createZIndexedDataFrameBySample(Dataset df, List zCols, int fileNum) { + return RangeSampleSort$.MODULE$.sortDataFrameBySample(df, JavaConversions.asScalaBuffer(zCols), fileNum); + } + + public static Dataset createZIndexedDataFrameBySample(Dataset df, String zCols, int fileNum) { + if (zCols == null || zCols.isEmpty() || fileNum <= 0) { + return df; + } + return createZIndexedDataFrameBySample(df, Arrays.stream(zCols.split(",")).map(f -> f.trim()).collect(Collectors.toList()), fileNum); + } + + /** + * Parse min/max statistics stored in parquet footers for z-sort cols. + * no support collect statistics from timeStampType, since parquet file has not collect the statistics for timeStampType. + * to do adapt for rfc-27 + * + * @param df a spark DataFrame holds parquet files to be read. + * @param cols z-sort cols + * @return a dataFrame holds all statistics info. 
+ */ + public static Dataset getMinMaxValue(Dataset df, List cols) { + Map columnsMap = Arrays.stream(df.schema().fields()).collect(Collectors.toMap(e -> e.name(), e -> e.dataType())); + + List scanFiles = Arrays.asList(df.inputFiles()); + SparkContext sc = df.sparkSession().sparkContext(); + JavaSparkContext jsc = new JavaSparkContext(sc); + + SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration()); + int numParallelism = (scanFiles.size() / 3 + 1); + List> colMinMaxInfos = new ArrayList<>(); + String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION); + try { + String description = "Listing parquet column statistics"; + jsc.setJobDescription(description); + colMinMaxInfos = jsc.parallelize(scanFiles, numParallelism).mapPartitions(paths -> { + Configuration conf = serializableConfiguration.value(); + ParquetUtils parquetUtils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET); + List>> results = new ArrayList<>(); + while (paths.hasNext()) { + String path = paths.next(); + results.add(parquetUtils.readRangeFromParquetMetadata(conf, new Path(path), cols)); + } + return results.stream().flatMap(f -> f.stream()).iterator(); + }).collect(); + } finally { + jsc.setJobDescription(previousJobDescription); + } + + Map>> fileToStatsListMap = colMinMaxInfos.stream().collect(Collectors.groupingBy(e -> e.getFilePath())); + JavaRDD allMetaDataRDD = jsc.parallelize(fileToStatsListMap.values().stream().collect(Collectors.toList()), 1).map(f -> { + int colSize = f.size(); + if (colSize == 0) { + return null; + } else { + List rows = new ArrayList<>(); + rows.add(f.get(0).getFilePath()); + cols.stream().forEach(col -> { + HoodieColumnRangeMetadata currentColRangeMetaData = + f.stream().filter(s -> s.getColumnName().trim().equalsIgnoreCase(col)).findFirst().orElse(null); + DataType colType = columnsMap.get(col); + if (currentColRangeMetaData == null || colType == null) { + throw new HoodieException(String.format("cannot collect min/max statistics for col: %s", col)); + } + if (colType instanceof IntegerType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof DoubleType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof StringType) { + String minString = new String(((Binary)currentColRangeMetaData.getMinValue()).getBytes()); + String maxString = new String(((Binary)currentColRangeMetaData.getMaxValue()).getBytes()); + rows.add(minString); + rows.add(maxString); + } else if (colType instanceof DecimalType) { + Double minDecimal = Double.parseDouble(currentColRangeMetaData.getStringifier().stringify(Long.valueOf(currentColRangeMetaData.getMinValue().toString()))); + Double maxDecimal = Double.parseDouble(currentColRangeMetaData.getStringifier().stringify(Long.valueOf(currentColRangeMetaData.getMaxValue().toString()))); + rows.add(BigDecimal.valueOf(minDecimal)); + rows.add(BigDecimal.valueOf(maxDecimal)); + } else if (colType instanceof DateType) { + rows.add(java.sql.Date.valueOf(currentColRangeMetaData.getStringifier().stringify((int)currentColRangeMetaData.getMinValue()))); + rows.add(java.sql.Date.valueOf(currentColRangeMetaData.getStringifier().stringify((int)currentColRangeMetaData.getMaxValue()))); + } else if (colType instanceof LongType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } 
else if (colType instanceof ShortType) { + rows.add(Short.parseShort(currentColRangeMetaData.getMinValue().toString())); + rows.add(Short.parseShort(currentColRangeMetaData.getMaxValue().toString())); + } else if (colType instanceof FloatType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof BinaryType) { + rows.add(((Binary)currentColRangeMetaData.getMinValue()).getBytes()); + rows.add(((Binary)currentColRangeMetaData.getMaxValue()).getBytes()); + } else if (colType instanceof BooleanType) { + rows.add(currentColRangeMetaData.getMinValue()); + rows.add(currentColRangeMetaData.getMaxValue()); + } else if (colType instanceof ByteType) { + rows.add(Byte.valueOf(currentColRangeMetaData.getMinValue().toString())); + rows.add(Byte.valueOf(currentColRangeMetaData.getMaxValue().toString())); + } else { + throw new HoodieException(String.format("Not support type: %s", colType)); + } + rows.add(currentColRangeMetaData.getNumNulls()); + }); + return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(rows)); + } + }).filter(f -> f != null); + List allMetaDataSchema = new ArrayList<>(); + allMetaDataSchema.add(new StructField("file", StringType$.MODULE$, true, Metadata.empty())); + cols.forEach(col -> { + allMetaDataSchema.add(new StructField(col + "_minValue", columnsMap.get(col), true, Metadata.empty())); + allMetaDataSchema.add(new StructField(col + "_maxValue", columnsMap.get(col), true, Metadata.empty())); + allMetaDataSchema.add(new StructField(col + "_num_nulls", LongType$.MODULE$, true, Metadata.empty())); + }); + return df.sparkSession().createDataFrame(allMetaDataRDD, StructType$.MODULE$.apply(allMetaDataSchema)); + } + + public static Dataset getMinMaxValue(Dataset df, String cols) { + List rawCols = Arrays.asList(cols.split(",")).stream().map(f -> f.trim()).collect(Collectors.toList()); + return getMinMaxValue(df, rawCols); + } + + /** + * Update statistics info. + * this method will update old index table by full out join, + * and save the updated table into a new index table based on commitTime. + * old index table will be cleaned also. + * + * @param df a spark DataFrame holds parquet files to be read. + * @param cols z-sort cols. + * @param indexPath index store path. + * @param commitTime current operation commitTime. + * @param validateCommits all validate commits for current table. 
+ * @return + */ + public static void saveStatisticsInfo(Dataset df, String cols, String indexPath, String commitTime, List validateCommits) { + Path savePath = new Path(indexPath, commitTime); + SparkSession spark = df.sparkSession(); + FileSystem fs = FSUtils.getFs(indexPath, spark.sparkContext().hadoopConfiguration()); + Dataset statisticsDF = ZCurveOptimizeHelper.getMinMaxValue(df, cols); + // try to find last validate index table from index path + try { + if (fs.exists(new Path(indexPath))) { + List allIndexTables = Arrays + .stream(fs.listStatus(new Path(indexPath))).filter(f -> f.isDirectory()).map(f -> f.getPath().getName()).collect(Collectors.toList()); + List candidateIndexTables = allIndexTables.stream().filter(f -> validateCommits.contains(f)).sorted().collect(Collectors.toList()); + List residualTables = allIndexTables.stream().filter(f -> !validateCommits.contains(f)).collect(Collectors.toList()); + Option latestIndexData = Option.empty(); + if (!candidateIndexTables.isEmpty()) { + latestIndexData = Option.of(spark.read().load(new Path(indexPath, candidateIndexTables.get(candidateIndexTables.size() - 1)).toString())); + // clean old index table, keep at most 1 index table. + candidateIndexTables.remove(candidateIndexTables.size() - 1); + candidateIndexTables.forEach(f -> { + try { + fs.delete(new Path(indexPath, f)); + } catch (IOException ie) { + throw new HoodieException(ie); + } + }); + } + + // clean residualTables + // retried cluster operations at the same instant time is also considered, + // the residual files produced by retried are cleaned up before save statistics + // save statistics info to index table which named commitTime + residualTables.forEach(f -> { + try { + fs.delete(new Path(indexPath, f)); + } catch (IOException ie) { + throw new HoodieException(ie); + } + }); + + if (latestIndexData.isPresent() && latestIndexData.get().schema().equals(statisticsDF.schema())) { + // update the statistics info + String originalTable = "indexTable_" + java.util.UUID.randomUUID().toString().replace("-", ""); + String updateTable = "updateTable_" + java.util.UUID.randomUUID().toString().replace("-", ""); + latestIndexData.get().registerTempTable(originalTable); + statisticsDF.registerTempTable(updateTable); + // update table by full out join + List columns = Arrays.asList(statisticsDF.schema().fieldNames()); + spark.sql(HoodieSparkUtils$ + .MODULE$.createMergeSql(originalTable, updateTable, JavaConversions.asScalaBuffer(columns))).repartition(1).write().save(savePath.toString()); + } + } else { + statisticsDF.repartition(1).write().mode("overwrite").save(savePath.toString()); + } + } catch (IOException e) { + throw new HoodieException(e); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala index 177de90f335dc..11cc4959442c8 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala @@ -28,12 +28,15 @@ import org.apache.avro.Schema.Type._ import org.apache.avro.generic.GenericData.{Fixed, Record} import org.apache.avro.generic.{GenericData, GenericFixed, GenericRecord} import org.apache.avro.{LogicalTypes, Schema} + import org.apache.spark.sql.Row -import org.apache.spark.sql.avro.{IncompatibleSchemaException, SchemaConverters} +import org.apache.spark.sql.avro.SchemaConverters import 
org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ + import org.apache.hudi.AvroConversionUtils._ +import org.apache.hudi.exception.HoodieIncompatibleSchemaException import scala.collection.JavaConverters._ @@ -131,7 +134,7 @@ object AvroConversionHelper { case null => new Timestamp(item.asInstanceOf[Long]) case other => - throw new IncompatibleSchemaException( + throw new HoodieIncompatibleSchemaException( s"Cannot convert Avro logical type $other to Catalyst Timestamp type.") } } @@ -149,7 +152,7 @@ object AvroConversionHelper { converters(i) = converter avroFieldIndexes(i) = avroField.pos() } else if (!sqlField.nullable) { - throw new IncompatibleSchemaException( + throw new HoodieIncompatibleSchemaException( s"Cannot find non-nullable field ${sqlField.name} at path ${path.mkString(".")} " + "in Avro schema\n" + s"Source Avro schema: $sourceAvroSchema.\n" + @@ -254,7 +257,7 @@ object AvroConversionHelper { converted(i) = fieldConverters(i)(item) new GenericRow(converted) } - case _ => throw new IncompatibleSchemaException( + case _ => throw new HoodieIncompatibleSchemaException( s"Cannot convert Avro schema to catalyst type because schema at path " + s"${path.mkString(".")} is not compatible " + s"(avroType = $other, sqlType = $sqlType). \n" + @@ -263,7 +266,7 @@ object AvroConversionHelper { } } case (left, right) => - throw new IncompatibleSchemaException( + throw new HoodieIncompatibleSchemaException( s"Cannot convert Avro schema to catalyst type because schema at path " + s"${path.mkString(".")} is not compatible (avroType = $left, sqlType = $right). \n" + s"Source Avro schema: $sourceAvroSchema.\n" + diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala index ed0ab97424a55..ce39843275815 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hudi.client.utils.SparkRowSerDe import org.apache.hudi.common.config.TypedProperties import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.keygen.constant.KeyGeneratorOptions import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory import org.apache.hudi.keygen.{BaseKeyGenerator, CustomAvroKeyGenerator, CustomKeyGenerator, KeyGenerator} @@ -35,6 +36,7 @@ import org.apache.spark.sql.avro.SchemaConverters import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal} import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex} +import org.apache.spark.sql.functions._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, SparkSession} @@ -60,12 +62,28 @@ object HoodieSparkUtils extends SparkAdapterSupport { } /** - * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]]. - * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally. 
+ * This method is inspired from [[org.apache.spark.deploy.SparkHadoopUtil]] with some modifications like + * skipping meta paths. */ def globPath(fs: FileSystem, pattern: Path): Seq[Path] = { - Option(fs.globStatus(pattern)).map { statuses => - statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq + // find base path to assist in skipping meta paths + var basePath = pattern.getParent + while (basePath.getName.equals("*")) { + basePath = basePath.getParent + } + + Option(fs.globStatus(pattern)).map { statuses => { + val nonMetaStatuses = statuses.filterNot(entry => { + // skip all entries in meta path + var leafPath = entry.getPath + // walk through every parent until we reach base path. if .hoodie is found anywhere, path needs to be skipped + while (!leafPath.equals(basePath) && !leafPath.getName.equals(HoodieTableMetaClient.METAFOLDER_NAME)) { + leafPath = leafPath.getParent + } + leafPath.getName.equals(HoodieTableMetaClient.METAFOLDER_NAME) + }) + nonMetaStatuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq + } }.getOrElse(Seq.empty[Path]) } @@ -88,8 +106,7 @@ object HoodieSparkUtils extends SparkAdapterSupport { def checkAndGlobPathIfNecessary(paths: Seq[String], fs: FileSystem): Seq[Path] = { paths.flatMap(path => { val qualified = new Path(path).makeQualified(fs.getUri, fs.getWorkingDirectory) - val globPaths = globPathIfNecessary(fs, qualified) - globPaths + globPathIfNecessary(fs, qualified) }) } @@ -268,4 +285,43 @@ object HoodieSparkUtils extends SparkAdapterSupport { s"${tableSchema.fieldNames.mkString(",")}") AttributeReference(columnName, field.get.dataType, field.get.nullable)() } + + /** + * Create merge sql to merge leftTable and right table. + * + * @param leftTable table name. + * @param rightTable table name. + * @param cols merged cols. + * @return merge sql. + */ + def createMergeSql(leftTable: String, rightTable: String, cols: Seq[String]): String = { + var selectsql = "" + for (i <- (0 to cols.size-1)) { + selectsql = selectsql + s" if (${leftTable}.${cols(0)} is null, ${rightTable}.${cols(i)}, ${leftTable}.${cols(i)}) as ${cols(i)} ," + } + "select " + selectsql.dropRight(1) + s" from ${leftTable} full join ${rightTable} on ${leftTable}.${cols(0)} = ${rightTable}.${cols(0)}" + } + + /** + * Collect min/max statistics for candidate cols. + * support all col types. + * + * @param df dataFrame holds read files. + * @param cols candidate cols to collect statistics. 
+ * @return + */ + def getMinMaxValueSpark(df: DataFrame, cols: Seq[String]): DataFrame = { + val sqlContext = df.sparkSession.sqlContext + import sqlContext.implicits._ + + val values = cols.flatMap(c => Seq( min(col(c)).as(c + "_minValue"), max(col(c)).as(c + "_maxValue"), count(c).as(c + "_noNullCount"))) + val valueCounts = count("*").as("totalNum") + val projectValues = Seq(col("file")) ++ cols.flatMap(c => + Seq(col(c + "_minValue"), col(c + "_maxValue"), expr(s"totalNum - ${c + "_noNullCount"}").as(c + "_num_nulls"))) + + val result = df.select(input_file_name() as "file", col("*")) + .groupBy($"file") + .agg(valueCounts, values: _*).select(projectValues:_*) + result + } } diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala index 2c59495b0d0d2..a93331ee20d48 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala @@ -87,4 +87,9 @@ trait SparkAdapter extends Serializable { * Create Like expression. */ def createLike(left: Expression, right: Expression): Expression + + /** + * ParserInterface#parseMultipartIdentifier is supported since spark3, for spark2 this should not be called. + */ + def parseMultipartIdentifier(parser: ParserInterface, sqlText: String): Seq[String] } diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala new file mode 100644 index 0000000000000..da993b7545e53 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala @@ -0,0 +1,526 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.execution + +import java.util + +import org.apache.hudi.config.HoodieClusteringConfig +import org.apache.spark.rdd.{PartitionPruningRDD, RDD} +import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, BoundReference, SortOrder, UnsafeProjection, UnsafeRow} +import org.apache.hudi.optimize.ZOrderingUtil +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering +import org.apache.spark.sql.types._ +import org.apache.spark.util.MutablePair +import org.apache.spark.util.random.SamplingUtils + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.reflect.{ClassTag, classTag} +import scala.util.hashing.byteswap32 + +class RangeSample[K: ClassTag, V]( + zEncodeNum: Int, + rdd: RDD[_ <: Product2[K, V]], + private var ascend: Boolean = true, + val samplePointsPerPartitionHint: Int = 20) extends Serializable { + + // We allow zEncodeNum = 0, which happens when sorting an empty RDD under the default settings. + require(zEncodeNum >= 0, s"Number of zEncodeNum cannot be negative but found $zEncodeNum.") + require(samplePointsPerPartitionHint > 0, + s"Sample points per partition must be greater than 0 but found $samplePointsPerPartitionHint") + + def getRangeBounds(): ArrayBuffer[(K, Float)] = { + if (zEncodeNum <= 1) { + ArrayBuffer.empty[(K, Float)] + } else { + // This is the sample size we need to have roughly balanced output partitions, capped at 1M. + // Cast to double to avoid overflowing ints or longs + val sampleSize = math.min(samplePointsPerPartitionHint.toDouble * zEncodeNum, 1e6) + // Assume the input partitions are roughly balanced and over-sample a little bit. + val sampleSizePerPartition = math.ceil(3.0 * sampleSize / rdd.partitions.length).toInt + val (numItems, sketched) = sketch(rdd.map(_._1), sampleSizePerPartition) + if (numItems == 0L) { + ArrayBuffer.empty[(K, Float)] + } else { + // If a partition contains much more than the average number of items, we re-sample from it + // to ensure that enough items are collected from that partition. + val fraction = math.min(sampleSize / math.max(numItems, 1L), 1.0) + val candidates = ArrayBuffer.empty[(K, Float)] + val imbalancedPartitions = mutable.Set.empty[Int] + + sketched.foreach { case (idx, n, sample) => + if (fraction * n > sampleSizePerPartition) { + imbalancedPartitions += idx + } else { + // The weight is 1 over the sampling probability. + val weight = (n.toDouble / sample.length).toFloat + for (key <- sample) { + candidates += ((key, weight)) + } + } + } + + if (imbalancedPartitions.nonEmpty) { + // Re-sample imbalanced partitions with the desired sampling probability. + val imbalanced = new PartitionPruningRDD(rdd.map(_._1), imbalancedPartitions.contains) + val seed = byteswap32(-rdd.id - 1) + val reSampled = imbalanced.sample(withReplacement = false, fraction, seed).collect() + val weight = (1.0 / fraction).toFloat + candidates ++= reSampled.map(x => (x, weight)) + } + candidates + } + } + } + + /** + * Determines the bounds for range partitioning from candidates with weights indicating how many + * items each represents. Usually this is 1 over the probability used to sample this candidate. 
+ * + * @param candidates unordered candidates with weights + * @param partitions number of partitions + * @return selected bounds + */ + def determineBound[K : Ordering : ClassTag]( + candidates: ArrayBuffer[(K, Float)], + partitions: Int, ordering: Ordering[K]): Array[K] = { + val ordered = candidates.sortBy(_._1)(ordering) + val numCandidates = ordered.size + val sumWeights = ordered.map(_._2.toDouble).sum + val step = sumWeights / partitions + var cumWeight = 0.0 + var target = step + val bounds = ArrayBuffer.empty[K] + var i = 0 + var j = 0 + var previousBound = Option.empty[K] + while ((i < numCandidates) && (j < partitions - 1)) { + val (key, weight) = ordered(i) + cumWeight += weight + if (cumWeight >= target) { + // Skip duplicate values. + if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) { + bounds += key + target += step + j += 1 + previousBound = Some(key) + } + } + i += 1 + } + bounds.toArray + } + + def determineRowBounds[K : Ordering : ClassTag]( + candidates: ArrayBuffer[(K, Float)], + partitions: Int, orderings: Seq[Ordering[K]], + attributes: Seq[Attribute]): Array[Array[UnsafeRow]] = { + + orderings.zipWithIndex.map { case (ordering, index) => + val ordered = candidates.sortBy(_._1)(ordering) + val numCandidates = ordered.size + val sumWeights = ordered.map(_._2.toDouble).sum + val step = sumWeights / partitions + var cumWeight = 0.0 + var target = step + val bounds = ArrayBuffer.empty[K] + var i = 0 + var j = 0 + var previousBound = Option.empty[K] + while ((i < numCandidates) && (j < partitions - 1)) { + val (key, weight) = ordered(i) + cumWeight += weight + if (cumWeight >= target) { + // Skip duplicate values. + if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) { + bounds += key + target += step + j += 1 + previousBound = Some(key) + } + } + i += 1 + } + // build project + val project = UnsafeProjection.create(Seq(attributes(index)), attributes) + bounds.map { bound => + val row = bound.asInstanceOf[UnsafeRow] + project(row).copy() + }.toArray + }.toArray + } + + /** + * Sketches the input RDD via reservoir sampling on each partition. + * + * @param rdd the input RDD to sketch + * @param sampleSizePerPartition max sample size per partition + * @return (total number of items, an array of (partitionId, number of items, sample)) + */ + def sketch[K: ClassTag]( + rdd: RDD[K], + sampleSizePerPartition: Int): (Long, Array[(Int, Long, Array[K])]) = { + val shift = rdd.id + // val classTagK = classTag[K] // to avoid serializing the entire partitioner object + val sketched = rdd.mapPartitionsWithIndex { (idx, iter) => + val seed = byteswap32(idx ^ (shift << 16)) + val (sample, n) = SamplingUtils.reservoirSampleAndCount( + iter, sampleSizePerPartition, seed) + Iterator((idx, n, sample)) + }.collect() + val numItems = sketched.map(_._2).sum + (numItems, sketched) + } +} + +class RawDecisionBound[K : Ordering : ClassTag](ordering: Ordering[K]) extends Serializable { + + private var binarySearch: ((Array[K], K) => Int) = { + // For primitive keys, we can use the natural ordering. Otherwise, use the Ordering comparator. 
+ classTag[K] match { + case ClassTag.Float => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Float]], x.asInstanceOf[Float]) + case ClassTag.Double => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Double]], x.asInstanceOf[Double]) + case ClassTag.Byte => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Byte]], x.asInstanceOf[Byte]) + case ClassTag.Char => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Char]], x.asInstanceOf[Char]) + case ClassTag.Short => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Short]], x.asInstanceOf[Short]) + case ClassTag.Int => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Int]], x.asInstanceOf[Int]) + case ClassTag.Long => + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Long]], x.asInstanceOf[Long]) + case _ => + val comparator = ordering.asInstanceOf[java.util.Comparator[Any]] + (l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[AnyRef]], x, comparator) + } + } + + def getBound(key: Any, candidateBounds: Array[K]): Int = { + val k = key.asInstanceOf[K] + var bound = 0 + if (candidateBounds.length <= 128) { + while(bound < candidateBounds.length && ordering.gt(k, candidateBounds(bound))) { + bound += 1 + } + } else { + bound = binarySearch(candidateBounds, k) + if (bound < 0 ) { + bound = -bound - 1 + } + if (bound > candidateBounds.length) { + bound = candidateBounds.length + } + } + bound + } +} + +case class ZorderingBinarySort(b: Array[Byte]) extends Ordered[ZorderingBinarySort] with Serializable { + override def compare(that: ZorderingBinarySort): Int = { + val len = this.b.length + ZOrderingUtil.compareTo(this.b, 0, len, that.b, 0, len) + } +} + +object RangeSampleSort { + + /** + * create z-order DataFrame by sample + * support all col types + */ + def sortDataFrameBySampleSupportAllTypes(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { + val spark = df.sparkSession + val internalRdd = df.queryExecution.toRdd + val schema = df.schema + val outputAttributes = df.queryExecution.analyzed.output + val sortingExpressions = outputAttributes.filter(p => zCols.contains(p.name)) + if (sortingExpressions.length == 0 || sortingExpressions.length != zCols.size) { + df + } else { + val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.key, + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.defaultValue.toString).toInt + + val sampleRdd = internalRdd.mapPartitionsInternal { iter => + val projection = UnsafeProjection.create(sortingExpressions, outputAttributes) + val mutablePair = new MutablePair[InternalRow, Null]() + // Internally, RangePartitioner runs a job on the RDD that samples keys to compute + // partition bounds. To get accurate samples, we need to copy the mutable keys. 
+ iter.map(row => mutablePair.update(projection(row).copy(), null)) + } + + val orderings = sortingExpressions.map(SortOrder(_, Ascending)).zipWithIndex.map { case (ord, i) => + ord.copy(child = BoundReference(i, ord.dataType, ord.nullable)) + } + + val lazyGeneratedOrderings = orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) + + val sample = new RangeSample(zOrderBounds, sampleRdd) + + val rangeBounds = sample.getRangeBounds() + + implicit val ordering1 = lazyGeneratedOrderings(0) + + val sampleBounds = sample.determineRowBounds(rangeBounds, math.min(zOrderBounds, rangeBounds.length), lazyGeneratedOrderings, sortingExpressions) + + val origin_orderings = sortingExpressions.map(SortOrder(_, Ascending)).map { ord => + ord.copy(child = BoundReference(0, ord.dataType, ord.nullable)) + } + + val origin_lazyGeneratedOrderings = origin_orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) + + // expand bounds. + // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, + // however this will lead to extra time costs when all zorder cols distinct count values are less then "spark.zorder.bounds.number" + val maxLength = sampleBounds.map(_.length).max + val expandSampleBoundsWithFactor = sampleBounds.map { bound => + val fillFactor = maxLength / bound.size.toDouble + (bound, fillFactor) + } + + val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) + + val indexRdd = internalRdd.mapPartitionsInternal { iter => + val boundsWithFactor = boundBroadCast.value + import java.util.concurrent.ThreadLocalRandom + val threadLocalRandom = ThreadLocalRandom.current + val maxBoundNum = boundsWithFactor.map(_._1.length).max + val origin_Projections = sortingExpressions.map { se => + UnsafeProjection.create(Seq(se), outputAttributes) + } + + iter.map { unsafeRow => + val interleaveValues = origin_Projections.zip(origin_lazyGeneratedOrderings).zipWithIndex.map { case ((rowProject, lazyOrdering), index) => + val row = rowProject(unsafeRow) + val decisionBound = new RawDecisionBound(lazyOrdering) + if (row.isNullAt(0)) { + maxBoundNum + 1 + } else { + val (bound, factor) = boundsWithFactor(index) + if (factor > 1) { + val currentRank = decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) + currentRank*factor.toInt + threadLocalRandom.nextInt(factor.toInt) + } else { + decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) + } + } + }.toArray.map(ZOrderingUtil.intTo8Byte(_)) + val zValues = ZOrderingUtil.interleaving(interleaveValues, 8) + val mutablePair = new MutablePair[InternalRow, Array[Byte]]() + + mutablePair.update(unsafeRow, zValues) + } + }.sortBy(x => ZorderingBinarySort(x._2), numPartitions = fileNum).map(_._1) + spark.internalCreateDataFrame(indexRdd, schema) + } + } + + /** + * create z-order DataFrame by sample + * first, sample origin data to get z-cols bounds, then create z-order DataFrame + * support all type data. 
+ * this method need more resource and cost more time than createZIndexedDataFrameByMapValue + */ + def sortDataFrameBySample(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { + val spark = df.sparkSession + val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap + val fieldNum = df.schema.fields.length + val checkCols = zCols.filter(col => columnsMap(col) != null) + + if (zCols.isEmpty || checkCols.isEmpty) { + df + } else { + val zFields = zCols.map { col => + val newCol = columnsMap(col) + if (newCol == null) { + (-1, null) + } else { + newCol.dataType match { + case LongType | DoubleType | FloatType | StringType | IntegerType | DateType | TimestampType | ShortType | ByteType => + (df.schema.fields.indexOf(newCol), newCol) + case d: DecimalType => + (df.schema.fields.indexOf(newCol), newCol) + case _ => + (-1, null) + } + } + }.filter(_._1 != -1) + // Complex type found, use createZIndexedDataFrameByRange + if (zFields.length != zCols.length) { + return sortDataFrameBySampleSupportAllTypes(df, zCols, fieldNum) + } + + val rawRdd = df.rdd + val sampleRdd = rawRdd.map { row => + val values = zFields.map { case (index, field) => + field.dataType match { + case LongType => + if (row.isNullAt(index)) Long.MaxValue else row.getLong(index) + case DoubleType => + if (row.isNullAt(index)) Long.MaxValue else java.lang.Double.doubleToLongBits(row.getDouble(index)) + case IntegerType => + if (row.isNullAt(index)) Long.MaxValue else row.getInt(index).toLong + case FloatType => + if (row.isNullAt(index)) Long.MaxValue else java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble) + case StringType => + if (row.isNullAt(index)) "" else row.getString(index) + case DateType => + if (row.isNullAt(index)) Long.MaxValue else row.getDate(index).getTime + case TimestampType => + if (row.isNullAt(index)) Long.MaxValue else row.getTimestamp(index).getTime + case ByteType => + if (row.isNullAt(index)) Long.MaxValue else row.getByte(index).toLong + case ShortType => + if (row.isNullAt(index)) Long.MaxValue else row.getShort(index).toLong + case d: DecimalType => + if (row.isNullAt(index)) Long.MaxValue else row.getDecimal(index).longValue() + case _ => + null + } + }.filter(v => v != null).toArray + (values, null) + } + val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.key, + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.defaultValue.toString).toInt + val sample = new RangeSample(zOrderBounds, sampleRdd) + val rangeBounds = sample.getRangeBounds() + val sampleBounds = { + val candidateColNumber = rangeBounds.head._1.length + (0 to candidateColNumber - 1).map { i => + val colRangeBound = rangeBounds.map(x => (x._1(i), x._2)) + + if (colRangeBound.head._1.isInstanceOf[String]) { + sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(String, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[String]) + } else { + sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(Long, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[Long]) + } + } + } + + // expand bounds. 
+ // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, + // however this will lead to extra time costs when all zorder cols distinct count values are less then "spark.zorder.bounds.number" + val maxLength = sampleBounds.map(_.length).max + val expandSampleBoundsWithFactor = sampleBounds.map { bound => + val fillFactor = maxLength / bound.size + val newBound = new Array[Double](bound.length * fillFactor) + if (bound.isInstanceOf[Array[Long]] && fillFactor > 1) { + val longBound = bound.asInstanceOf[Array[Long]] + for (i <- 0 to bound.length - 1) { + for (j <- 0 to fillFactor - 1) { + // sample factor shoud not be too large, so it's ok to use 1 / fillfactor as slice + newBound(j + i*(fillFactor)) = longBound(i) + (j + 1) * (1 / fillFactor.toDouble) + } + } + (newBound, fillFactor) + } else { + (bound, 0) + } + } + + val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) + + val indexRdd = rawRdd.mapPartitions { iter => + val expandBoundsWithFactor = boundBroadCast.value + val maxBoundNum = expandBoundsWithFactor.map(_._1.length).max + val longDecisionBound = new RawDecisionBound(Ordering[Long]) + val doubleDecisionBound = new RawDecisionBound(Ordering[Double]) + val stringDecisionBound = new RawDecisionBound(Ordering[String]) + import java.util.concurrent.ThreadLocalRandom + val threadLocalRandom = ThreadLocalRandom.current + + def getRank(rawIndex: Int, value: Long, isNull: Boolean): Int = { + val (expandBound, factor) = expandBoundsWithFactor(rawIndex) + if (isNull) { + expandBound.length + 1 + } else { + if (factor > 1) { + doubleDecisionBound.getBound(value + (threadLocalRandom.nextInt(factor) + 1)*(1 / factor.toDouble), expandBound.asInstanceOf[Array[Double]]) + } else { + longDecisionBound.getBound(value, expandBound.asInstanceOf[Array[Long]]) + } + } + } + + iter.map { row => + val values = zFields.zipWithIndex.map { case ((index, field), rawIndex) => + field.dataType match { + case LongType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getLong(index), isNull) + case DoubleType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getDouble(index)), isNull) + case IntegerType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getInt(index).toLong, isNull) + case FloatType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble), isNull) + case StringType => + val factor = maxBoundNum.toDouble / expandBoundsWithFactor(rawIndex)._1.length + if (row.isNullAt(index)) { + maxBoundNum + 1 + } else { + val currentRank = stringDecisionBound.getBound(row.getString(index), expandBoundsWithFactor(rawIndex)._1.asInstanceOf[Array[String]]) + if (factor > 1) { + (currentRank*factor).toInt + threadLocalRandom.nextInt(factor.toInt) + } else { + currentRank + } + } + case DateType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getDate(index).getTime, isNull) + case TimestampType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getTimestamp(index).getTime, isNull) + case ByteType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getByte(index).toLong, isNull) + case ShortType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getShort(index).toLong, isNull) + case d: DecimalType => + val isNull = row.isNullAt(index) 
+ getRank(rawIndex, if (isNull) 0 else row.getDecimal(index).longValue(), isNull) + case _ => + -1 + } + }.filter(v => v != -1).map(ZOrderingUtil.intTo8Byte(_)).toArray + val zValues = ZOrderingUtil.interleaving(values, 8) + Row.fromSeq(row.toSeq ++ Seq(zValues)) + } + }.sortBy(x => ZorderingBinarySort(x.getAs[Array[Byte]](fieldNum)), numPartitions = fileNum) + val newDF = df.sparkSession.createDataFrame(indexRdd, StructType( + df.schema.fields ++ Seq( + StructField(s"zindex", + BinaryType, false)) + )) + newDF.drop("zindex") + } + } +} + diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java index c7599aac0c83d..c70a2cf6a5832 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java @@ -23,6 +23,7 @@ import java.util.stream.Collectors; import org.apache.hadoop.fs.Path; import org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass; +import org.apache.hudi.common.config.LockConfiguration; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieTableType; @@ -42,6 +43,8 @@ import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; @@ -123,15 +126,27 @@ public void testHoodieClientBasicMultiWriter(HoodieTableType tableType) throws E } } - @ParameterizedTest - @EnumSource(value = HoodieTableType.class, names = {"COPY_ON_WRITE", "MERGE_ON_READ"}) - public void testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType tableType) throws Exception { + @Disabled + public void testMultiWriterWithAsyncTableServicesWithConflictCOW() throws Exception { + testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType.COPY_ON_WRITE); + } + + @Test + public void testMultiWriterWithAsyncTableServicesWithConflictMOR() throws Exception { + testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType.MERGE_ON_READ); + } + + private void testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType tableType) throws Exception { // create inserts X 1 if (tableType == HoodieTableType.MERGE_ON_READ) { setUpMORTestTable(); } Properties properties = new Properties(); properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "3"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "5000"); // Disabling embedded timeline server, it doesn't work with multiwriter HoodieWriteConfig cfg = getConfigBuilder() .withCompactionConfig(HoodieCompactionConfig.newBuilder().withAutoClean(false) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java index 
83761c985b040..457b8b526aa04 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java @@ -18,6 +18,7 @@ package org.apache.hudi.client; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieRecord; @@ -30,6 +31,7 @@ import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.testutils.HoodieClientTestUtils; @@ -67,9 +69,15 @@ public void tearDown() throws Exception { } protected HoodieWriteConfig getHoodieWriteConfig(String basePath) { + return getHoodieWriteConfig(basePath, HoodieMetadataConfig.ENABLE.defaultValue()); + } + + protected HoodieWriteConfig getHoodieWriteConfig(String basePath, boolean enableMetadata) { return HoodieWriteConfig.newBuilder().withPath(basePath).withEmbeddedTimelineServerEnabled(true) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable(tableName) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadata).build()) + .build(); } @Test @@ -82,8 +90,17 @@ public void readLocalWriteHDFS() throws Exception { .initTable(hadoopConf, dfsBasePath); // Create write client to write some records in - HoodieWriteConfig cfg = getHoodieWriteConfig(dfsBasePath); - HoodieWriteConfig localConfig = getHoodieWriteConfig(tablePath); + HoodieWriteConfig cfg = getHoodieWriteConfig(dfsBasePath, false); + HoodieWriteConfig localConfig = getHoodieWriteConfig(tablePath, false); + + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setTableName(tableName) + .setPayloadClass(HoodieAvroPayload.class) + .setRecordKeyFields(localConfig.getProps().getProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key())) + .setPartitionFields(localConfig.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())) + .initTable(hadoopConf, tablePath); + try (SparkRDDWriteClient hdfsWriteClient = getHoodieWriteClient(cfg); SparkRDDWriteClient localWriteClient = getHoodieWriteClient(localConfig)) { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHBaseIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHBaseIndex.java index 3ad777475f496..db17ceae92af6 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHBaseIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHBaseIndex.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -155,7 +156,7 @@ public void testSimpleTagLocationAndUpdate(HoodieTableType 
tableType) throws Exc HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); // Test tagLocation without any entries in index - JavaRDD records1 = index.tagLocation(writeRecords, context(), hoodieTable); + JavaRDD records1 = tagLocation(index, writeRecords, hoodieTable); assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count()); // Insert 200 records @@ -164,7 +165,7 @@ public void testSimpleTagLocationAndUpdate(HoodieTableType tableType) throws Exc assertNoWriteErrors(writeStatues.collect()); // Now tagLocation for these records, hbaseIndex should not tag them since commit never occurred - JavaRDD records2 = index.tagLocation(writeRecords, context(), hoodieTable); + JavaRDD records2 = tagLocation(index, writeRecords, hoodieTable); assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count()); // Now commit this & update location of records inserted and validate no errors @@ -172,7 +173,7 @@ public void testSimpleTagLocationAndUpdate(HoodieTableType tableType) throws Exc // Now tagLocation for these records, hbaseIndex should tag them correctly metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List records3 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); + List records3 = tagLocation(index, writeRecords, hoodieTable).collect(); assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null @@ -206,17 +207,17 @@ public void testTagLocationAndPartitionPathUpdate() throws Exception { metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - JavaRDD oldHoodieRecord = index.tagLocation(oldWriteRecords, context, hoodieTable); + JavaRDD oldHoodieRecord = tagLocation(index, oldWriteRecords, hoodieTable); assertEquals(0, oldHoodieRecord.filter(record -> record.isCurrentLocationKnown()).count()); writeClient.startCommitWithTime(newCommitTime); JavaRDD writeStatues = writeClient.upsert(oldWriteRecords, newCommitTime); writeClient.commit(newCommitTime, writeStatues); assertNoWriteErrors(writeStatues.collect()); - index.updateLocation(writeStatues, context, hoodieTable); + updateLocation(index, writeStatues, hoodieTable); metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List taggedRecords = index.tagLocation(newWriteRecords, context, hoodieTable).collect(); + List taggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); assertEquals(numRecords * 2L, taggedRecords.stream().count()); // Verify the number of deleted records assertEquals(numRecords, taggedRecords.stream().filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) @@ -226,7 +227,7 @@ public void testTagLocationAndPartitionPathUpdate() throws Exception { // not allowed path change test index = new SparkHoodieHBaseIndex<>(getConfig(false, false)); - List notAllowPathChangeRecords = index.tagLocation(newWriteRecords, context, hoodieTable).collect(); + List notAllowPathChangeRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); assertEquals(numRecords, notAllowPathChangeRecords.stream().count()); assertEquals(numRecords, 
taggedRecords.stream().filter(hoodieRecord -> hoodieRecord.isCurrentLocationKnown() && hoodieRecord.getKey().getPartitionPath().equals(oldPartitionPath)).count()); @@ -249,7 +250,7 @@ public void testTagLocationAndDuplicateUpdate() throws Exception { HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - index.tagLocation(writeRecords, context(), hoodieTable); + tagLocation(index, writeRecords, hoodieTable); // Duplicate upsert and ensure correctness is maintained // We are trying to approximately imitate the case when the RDD is recomputed. For RDD creating, driver code is not @@ -265,7 +266,7 @@ public void testTagLocationAndDuplicateUpdate() throws Exception { // Now tagLocation for these records, hbaseIndex should tag them correctly metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List taggedRecords = index.tagLocation(writeRecords, context(), hoodieTable).collect(); + List taggedRecords = tagLocation(index, writeRecords, hoodieTable).collect(); assertEquals(numRecords, taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); assertEquals(numRecords, taggedRecords.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); assertEquals(numRecords, taggedRecords.stream().filter(record -> (record.getCurrentLocation() != null @@ -294,22 +295,22 @@ public void testTagLocationAndPartitionPathUpdateWithExplicitRollback() throws E // first commit old record metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List beforeFirstTaggedRecords = index.tagLocation(oldWriteRecords, context, hoodieTable).collect(); + List beforeFirstTaggedRecords = tagLocation(index, oldWriteRecords, hoodieTable).collect(); JavaRDD oldWriteStatues = writeClient.upsert(oldWriteRecords, firstCommitTime); - index.updateLocation(oldWriteStatues, context, hoodieTable); + updateLocation(index, oldWriteStatues, hoodieTable); writeClient.commit(firstCommitTime, oldWriteStatues); - List afterFirstTaggedRecords = index.tagLocation(oldWriteRecords, context, hoodieTable).collect(); + List afterFirstTaggedRecords = tagLocation(index, oldWriteRecords, hoodieTable).collect(); metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); final String secondCommitTime = writeClient.startCommit(); - List beforeSecondTaggedRecords = index.tagLocation(newWriteRecords, context, hoodieTable).collect(); + List beforeSecondTaggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); JavaRDD newWriteStatues = writeClient.upsert(newWriteRecords, secondCommitTime); - index.updateLocation(newWriteStatues, context, hoodieTable); + updateLocation(index, newWriteStatues, hoodieTable); writeClient.commit(secondCommitTime, newWriteStatues); - List afterSecondTaggedRecords = index.tagLocation(newWriteRecords, context, hoodieTable).collect(); + List afterSecondTaggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); writeClient.rollback(secondCommitTime); - List afterRollback = index.tagLocation(newWriteRecords, context, hoodieTable).collect(); + List afterRollback = tagLocation(index, newWriteRecords, hoodieTable).collect(); // Verify the first commit assertEquals(numRecords, beforeFirstTaggedRecords.stream().filter(record -> record.getCurrentLocation() == 
null).count()); @@ -335,7 +336,8 @@ public void testTagLocationAndPartitionPathUpdateWithExplicitRollback() throws E @Test public void testSimpleTagLocationAndUpdateWithRollback() throws Exception { // Load to memory - HoodieWriteConfig config = getConfig(); + HoodieWriteConfig config = getConfigBuilder(100, false, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build(); SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); SparkRDDWriteClient writeClient = getHoodieWriteClient(config); @@ -353,7 +355,7 @@ public void testSimpleTagLocationAndUpdateWithRollback() throws Exception { writeClient.commit(newCommitTime, writeStatues); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); // Now tagLocation for these records, hbaseIndex should tag them - List records2 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); + List records2 = tagLocation(index, writeRecords, hoodieTable).collect(); assertEquals(numRecords, records2.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); // check tagged records are tagged with correct fileIds @@ -369,7 +371,7 @@ public void testSimpleTagLocationAndUpdateWithRollback() throws Exception { hoodieTable = HoodieSparkTable.create(config, context, metaClient); // Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled // back commit - List records3 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); + List records3 = tagLocation(index, writeRecords, hoodieTable).collect(); assertEquals(0, records3.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); assertEquals(0, records3.stream().filter(record -> record.getCurrentLocation() != null).count()); } @@ -395,7 +397,7 @@ public void testSimpleTagLocationWithInvalidCommit() throws Exception { // verify location is tagged. HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - JavaRDD javaRDD0 = index.tagLocation(invalidWriteRecords, context(), hoodieTable); + JavaRDD javaRDD0 = tagLocation(index, invalidWriteRecords, hoodieTable); assert (javaRDD0.collect().size() == 1); // one record present assert (javaRDD0.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 1); // it is tagged assert (javaRDD0.collect().get(0).getCurrentLocation().getInstantTime().equals(invalidCommit)); @@ -406,11 +408,11 @@ public void testSimpleTagLocationWithInvalidCommit() throws Exception { // Now tagLocation for the valid records, hbaseIndex should tag them metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - JavaRDD javaRDD1 = index.tagLocation(writeRecords, context(), hoodieTable); + JavaRDD javaRDD1 = tagLocation(index, writeRecords, hoodieTable); assert (javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 199); // tagLocation for the invalid record - commit is not present in timeline due to rollback. 
- JavaRDD javaRDD2 = index.tagLocation(invalidWriteRecords, context(), hoodieTable); + JavaRDD javaRDD2 = tagLocation(index, invalidWriteRecords, hoodieTable); assert (javaRDD2.collect().size() == 1); // one record present assert (javaRDD2.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 0); // it is not tagged } @@ -422,7 +424,8 @@ public void testSimpleTagLocationWithInvalidCommit() throws Exception { @Test public void testEnsureTagLocationUsesCommitTimeline() throws Exception { // Load to memory - HoodieWriteConfig config = getConfig(); + HoodieWriteConfig config = getConfigBuilder(100, false, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build(); SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); SparkRDDWriteClient writeClient = getHoodieWriteClient(config); @@ -439,7 +442,7 @@ public void testEnsureTagLocationUsesCommitTimeline() throws Exception { // Now tagLocation for the first set of rolledback records, hbaseIndex should tag them metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - JavaRDD javaRDD1 = index.tagLocation(writeRecords1, context(), hoodieTable); + JavaRDD javaRDD1 = tagLocation(index, writeRecords1, hoodieTable); assert (javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 20); } @@ -489,7 +492,7 @@ public void testHbaseTagLocationForArchivedCommits() throws Exception { // tagLocation for the first set of records (for the archived commit), hbaseIndex should tag them as valid metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - JavaRDD javaRDD1 = index.tagLocation(writeRecords1, context(), hoodieTable); + JavaRDD javaRDD1 = tagLocation(index, writeRecords1, hoodieTable); assertEquals(20, javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size()); } @@ -521,7 +524,7 @@ public void testTotalGetsBatching() throws Exception { assertNoWriteErrors(writeStatues.collect()); // Now tagLocation for these records, hbaseIndex should tag them - index.tagLocation(writeRecords, context(), hoodieTable); + tagLocation(index, writeRecords, hoodieTable); // 3 batches should be executed given batchSize = 100 and parallelism = 1 verify(table, times(3)).get((List) any()); @@ -559,7 +562,7 @@ public void testTotalPutsBatching() throws Exception { // Get all the files generated int numberOfDataFileIds = (int) writeStatues.map(status -> status.getFileId()).distinct().count(); - index.updateLocation(writeStatues, context(), hoodieTable); + updateLocation(index, writeStatues, hoodieTable); // 3 batches should be executed given batchSize = 100 and <=numberOfDataFileIds getting updated, // so each fileId ideally gets updates verify(table, atMost(numberOfDataFileIds)).put((List) any()); @@ -693,7 +696,7 @@ public void testSmallBatchSize() throws Exception { HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); // Test tagLocation without any entries in index - JavaRDD records1 = index.tagLocation(writeRecords, context(), hoodieTable); + JavaRDD records1 = tagLocation(index, writeRecords, hoodieTable); assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count()); // Insert 200 records writeClient.startCommitWithTime(newCommitTime); @@ -702,7 +705,7 @@ public void testSmallBatchSize() throws Exception { // Now tagLocation for these records, hbaseIndex should not tag them since it was a 
failed // commit - JavaRDD records2 = index.tagLocation(writeRecords, context(), hoodieTable); + JavaRDD records2 = tagLocation(index, writeRecords, hoodieTable); assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count()); // Now commit this & update location of records inserted and validate no errors @@ -710,7 +713,7 @@ public void testSmallBatchSize() throws Exception { // Now tagLocation for these records, hbaseIndex should tag them correctly metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List records3 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); + List records3 = tagLocation(index, writeRecords, hoodieTable).collect(); assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null @@ -733,7 +736,7 @@ public void testDelete() throws Exception { HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); // Test tagLocation without any entries in index - JavaRDD records1 = index.tagLocation(writeRecords, context(), hoodieTable); + JavaRDD records1 = tagLocation(index, writeRecords, hoodieTable); assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count()); // Insert records @@ -745,7 +748,7 @@ public void testDelete() throws Exception { // Now tagLocation for these records, hbaseIndex should tag them correctly metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List records2 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); + List records2 = tagLocation(index, writeRecords, hoodieTable).collect(); assertEquals(numRecords, records2.stream().filter(record -> record.isCurrentLocationKnown()).count()); assertEquals(numRecords, records2.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); assertEquals(numRecords, records2.stream().filter(record -> (record.getCurrentLocation() != null @@ -763,12 +766,12 @@ public void testDelete() throws Exception { // if not for this caching, due to RDD chaining/lineage, first time update is called again when subsequent update is called. // So caching here to break the chain and so future update does not re-trigger update of older Rdd. 
deleteWriteStatues.cache(); - JavaRDD deleteStatus = index.updateLocation(deleteWriteStatues, context(), hoodieTable); + JavaRDD deleteStatus = updateLocation(index, deleteWriteStatues, hoodieTable); assertEquals(deleteStatus.count(), deleteWriteStatues.count()); assertNoWriteErrors(deleteStatus.collect()); // Ensure no records can be tagged - List records3 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); + List records3 = tagLocation(index, writeRecords, hoodieTable).collect(); assertEquals(0, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); assertEquals(0, records3.stream().filter(record -> (record.getCurrentLocation() != null diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java index bec7ee4a8b3a2..f8c1dfc87f79f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java @@ -18,26 +18,35 @@ package org.apache.hudi.client.functional; -import org.apache.hudi.client.HoodieWriteResult; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass; import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.LockConfiguration; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.metrics.Registry; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.testutils.FileCreateUtils; @@ -45,16 +54,15 @@ import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieClusteringConfig; import 
org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieMetricsConfig; +import org.apache.hudi.config.HoodieLockConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieMetadataException; -import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.metadata.FileSystemBackedTableMetadata; -import org.apache.hudi.metadata.HoodieBackedTableMetadata; import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter; import org.apache.hudi.metadata.HoodieMetadataMetrics; import org.apache.hudi.metadata.HoodieTableMetadata; @@ -62,258 +70,474 @@ import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.testutils.HoodieClientTestHarness; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper; +import org.apache.hudi.table.upgrade.UpgradeDowngrade; +import org.apache.hudi.testutils.MetadataMergeWriteStatus; +import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.Time; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.stream.Collectors; +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static java.util.Collections.singletonList; +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.apache.hudi.common.model.WriteOperationType.DELETE; +import static org.apache.hudi.common.model.WriteOperationType.INSERT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @Tag("functional") -public class TestHoodieBackedMetadata extends 
HoodieClientTestHarness { +public class TestHoodieBackedMetadata extends TestHoodieMetadataBase { private static final Logger LOG = LogManager.getLogger(TestHoodieBackedMetadata.class); - @TempDir - public java.nio.file.Path tempFolder; + public static List bootstrapAndTableOperationTestArgs() { + return asList( + Arguments.of(COPY_ON_WRITE, true), + Arguments.of(COPY_ON_WRITE, false), + Arguments.of(MERGE_ON_READ, true), + Arguments.of(MERGE_ON_READ, false) + ); + } - private String metadataTableBasePath; + /** + * Metadata Table bootstrap scenarios. + */ + @ParameterizedTest + @MethodSource("bootstrapAndTableOperationTestArgs") + public void testMetadataTableBootstrap(HoodieTableType tableType, boolean addRollback) throws Exception { + init(tableType, false); + // bootstrap with few commits + doPreBootstrapOperations(testTable); + + writeConfig = getWriteConfig(true, true); + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + validateMetadata(testTable); + doWriteInsertAndUpsert(testTable); + validateMetadata(testTable); + if (addRollback) { + // trigger an UPSERT that will be rolled back + doWriteOperationAndValidate(testTable, "0000003"); + + // rollback last commit + doRollbackAndValidate(testTable, "0000003", "0000004"); + } - private HoodieTableType tableType; + // trigger couple of upserts + doWriteOperation(testTable, "0000005"); + doWriteOperation(testTable, "0000006"); + validateMetadata(testTable, true); + } - public void init(HoodieTableType tableType) throws IOException { - this.tableType = tableType; - initPath(); - initSparkContexts("TestHoodieMetadata"); - initFileSystem(); - fs.mkdirs(new Path(basePath)); - initMetaClient(tableType); - initTestDataGenerator(); - metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + /** + * Only valid partition directories are added to the metadata. 
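+ * Directories without partition metadata and directories matching the configured filter regex must be excluded.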
+ */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testOnlyValidPartitionsAdded(HoodieTableType tableType) throws Exception { + // This test requires local file system + init(tableType, false); + // Create an empty directory which is not a partition directory (lacks partition metadata) + final String nonPartitionDirectory = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-nonpartition"; + Files.createDirectories(Paths.get(basePath, nonPartitionDirectory)); - } + // Three directories which are partitions but will be ignored due to filter + final String filterDirRegex = ".*-filterDir\\d|\\..*"; + final String filteredDirectoryOne = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-filterDir1"; + final String filteredDirectoryTwo = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-filterDir2"; + final String filteredDirectoryThree = ".backups"; - @AfterEach - public void clean() throws IOException { - cleanupResources(); + // Create some commits + testTable.withPartitionMetaFiles("p1", "p2", filteredDirectoryOne, filteredDirectoryTwo, filteredDirectoryThree) + .addCommit("0000001").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10) + .addCommit("0000002").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10, 10); + + writeConfig = getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).withDirectoryFilterRegex(filterDirRegex).build()).build(); + testTable.doWriteOperation("0000003", UPSERT, emptyList(), asList("p1", "p2"), 1, true); + syncTableMetadata(writeConfig); + + List partitions = metadataWriter(writeConfig).metadata().getAllPartitionPaths(); + assertFalse(partitions.contains(nonPartitionDirectory), + "Must not contain the non-partition " + nonPartitionDirectory); + assertTrue(partitions.contains("p1"), "Must contain partition p1"); + assertTrue(partitions.contains("p2"), "Must contain partition p2"); + + assertFalse(partitions.contains(filteredDirectoryOne), + "Must not contain the filtered directory " + filteredDirectoryOne); + assertFalse(partitions.contains(filteredDirectoryTwo), + "Must not contain the filtered directory " + filteredDirectoryTwo); + assertFalse(partitions.contains(filteredDirectoryThree), + "Must not contain the filtered directory " + filteredDirectoryThree); + + FileStatus[] statuses = metadata(writeConfig, context).getAllFilesInPartition(new Path(basePath, "p1")); + assertEquals(tableType == COPY_ON_WRITE ? 3 : 4, statuses.length); + statuses = metadata(writeConfig, context).getAllFilesInPartition(new Path(basePath, "p2")); + assertEquals(tableType == COPY_ON_WRITE ? 6 : 7, statuses.length); + Map partitionsToFilesMap = metadata(writeConfig, context).getAllFilesInPartitions(asList(basePath + "/p1", basePath + "/p2")); + assertEquals(2, partitionsToFilesMap.size()); + assertEquals(tableType == COPY_ON_WRITE ? 3 : 4, partitionsToFilesMap.get(basePath + "/p1").length); + assertEquals(tableType == COPY_ON_WRITE ? 6 : 7, partitionsToFilesMap.get(basePath + "/p2").length); } /** - * Metadata Table bootstrap scenarios. + * Test various table operations sync to Metadata Table correctly. 
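+ * Covers inserts, upserts, compaction (MERGE_ON_READ only) and clean, validating the synced metadata along the way.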
*/ - @Test - public void testMetadataTableBootstrap() throws Exception { - init(HoodieTableType.COPY_ON_WRITE); - HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + @ParameterizedTest + @MethodSource("bootstrapAndTableOperationTestArgs") + public void testTableOperations(HoodieTableType tableType, boolean enableFullScan) throws Exception { + init(tableType, true, enableFullScan); + doWriteInsertAndUpsert(testTable); - // Metadata table should not exist until created for the first time - assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not exist"); - assertThrows(TableNotFoundException.class, () -> HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build()); - - // Metadata table is not created if disabled by config - String firstCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) { - client.startCommitWithTime(firstCommitTime); - client.insert(jsc.parallelize(dataGen.generateInserts(firstCommitTime, 5)), firstCommitTime); - assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not be created"); - assertThrows(TableNotFoundException.class, () -> HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build()); + // trigger an upsert + doWriteOperationAndValidate(testTable, "0000003"); + + // trigger compaction + if (MERGE_ON_READ.equals(tableType)) { + doCompactionAndValidate(testTable, "0000004"); } - // Metadata table should not be created if any non-complete instants are present - String secondCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, true), true)) { - client.startCommitWithTime(secondCommitTime); - client.insert(jsc.parallelize(dataGen.generateUpdates(secondCommitTime, 2)), secondCommitTime); - // AutoCommit is false so no bootstrap - client.syncTableMetadata(); - assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not be created"); - assertThrows(TableNotFoundException.class, () -> HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build()); - // rollback this commit - client.rollback(secondCommitTime); + // trigger an upsert + doWriteOperation(testTable, "0000005"); + + // trigger clean + doCleanAndValidate(testTable, "0000006", singletonList("0000001")); + + // trigger few upserts and validate + doWriteOperation(testTable, "0000007"); + doWriteOperation(testTable, "0000008"); + validateMetadata(testTable, emptyList(), true); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataInsertUpsertClean(HoodieTableType tableType) throws Exception { + init(tableType); + doWriteOperation(testTable, "0000001", INSERT); + doWriteOperation(testTable, "0000002"); + doCleanAndValidate(testTable, "0000003", Arrays.asList("0000001")); + if (tableType == MERGE_ON_READ) { + doCompaction(testTable, "0000004"); } + doWriteOperation(testTable, "0000005"); + validateMetadata(testTable, emptyList(), true); + } - // Metadata table created when enabled by config & sync is called - secondCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) { - client.startCommitWithTime(secondCommitTime); - 
client.insert(jsc.parallelize(dataGen.generateUpdates(secondCommitTime, 2)), secondCommitTime); - client.syncTableMetadata(); - assertTrue(fs.exists(new Path(metadataTableBasePath))); - validateMetadata(client); + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testInsertUpsertCluster(HoodieTableType tableType) throws Exception { + init(tableType); + doWriteOperation(testTable, "0000001", INSERT); + doWriteOperation(testTable, "0000002"); + doClusterAndValidate(testTable, "0000003"); + if (tableType == MERGE_ON_READ) { + doCompaction(testTable, "0000004"); } + validateMetadata(testTable, emptyList(), true); + } - // Delete all existing instants on dataset to simulate archiving. This should trigger a re-bootstrap of the metadata - // table as last synched instant has been "archived". - final String metadataTableMetaPath = metadataTableBasePath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME; - assertTrue(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(secondCommitTime)))); - - Arrays.stream(fs.listStatus(new Path(metaClient.getMetaPath()))).filter(status -> status.getPath().getName().matches("^\\d+\\..*")) - .forEach(status -> { - try { - fs.delete(status.getPath(), false); - } catch (IOException e) { - LOG.warn("Error when deleting instant " + status + ": " + e); - } - }); + /** + * Tests that table services in data table won't trigger table services in metadata table. + * @throws Exception + */ + @Test + public void testMetadataTableServices() throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType, false); + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .enableFullScan(true) + .enableMetrics(false) + .withMaxNumDeltaCommitsBeforeCompaction(3) // after 3 delta commits for regular writer operations, compaction should kick in. + .build()).build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + + doWriteOperation(testTable, "0000001", INSERT); + doCleanAndValidate(testTable, "0000003", Arrays.asList("0000001")); + + HoodieTableMetadata tableMetadata = metadata(writeConfig, context); + // since clean was the last commit, table servives should not get triggered in metadata table. + assertFalse(tableMetadata.getLatestCompactionTime().isPresent()); + + doWriteOperation(testTable, "0000004", UPSERT); + // this should have triggered compaction in metadata table + tableMetadata = metadata(writeConfig, context); + assertTrue(tableMetadata.getLatestCompactionTime().isPresent()); + assertEquals(tableMetadata.getLatestCompactionTime().get(), "0000004001"); + } - String thirdCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) { - client.startCommitWithTime(thirdCommitTime); - client.insert(jsc.parallelize(dataGen.generateUpdates(thirdCommitTime, 2)), thirdCommitTime); - client.syncTableMetadata(); - assertTrue(fs.exists(new Path(metadataTableBasePath))); - validateMetadata(client); + /** + * Test rollback of various table operations sync to Metadata Table correctly. 
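+ * Covers rollbacks of upserts, deletes, compaction (MERGE_ON_READ only) and partial commits, using both listing-based and marker-based rollback.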
+ */ + //@ParameterizedTest + //@EnumSource(HoodieTableType.class) + @Disabled + public void testRollbackOperations(HoodieTableType tableType) throws Exception { + init(tableType); + doWriteInsertAndUpsert(testTable); + + // trigger an upsert + doWriteOperationAndValidate(testTable, "0000003"); + + // trigger a commit and rollback + doWriteOperation(testTable, "0000004"); + doRollbackAndValidate(testTable, "0000004", "0000005"); + + // trigger few upserts and validate + for (int i = 6; i < 10; i++) { + doWriteOperation(testTable, "000000" + i); + } + validateMetadata(testTable); + + doWriteOperation(testTable, "0000010"); + + // rollback last commit. and validate. + doRollbackAndValidate(testTable, "0000010", "0000011"); - // Metadata Table should not have previous delta-commits as it was re-bootstrapped - assertFalse(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(firstCommitTime)))); - assertFalse(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(secondCommitTime)))); - assertTrue(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(thirdCommitTime)))); + // rollback of compaction + if (MERGE_ON_READ.equals(tableType)) { + doCompactionAndValidate(testTable, "0000012"); + doRollbackAndValidate(testTable, "0000012", "0000013"); } + + // roll back of delete + doWriteOperation(testTable, "0000014", DELETE); + doRollbackAndValidate(testTable, "0000014", "0000015"); + + // rollback partial commit + writeConfig = getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).build(); + doWriteOperation(testTable, "0000016"); + testTable.doRollback("0000016", "0000017"); + validateMetadata(testTable); + + // marker-based rollback of partial commit + writeConfig = getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(true).build(); + doWriteOperation(testTable, "0000018"); + testTable.doRollback("0000018", "0000019"); + validateMetadata(testTable, true); } /** - * Test enable/disable sync via the config. + * Test that manual rollbacks work correctly and enough timeline history is maintained on the metadata table + * timeline. 
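+ * The metadata table is configured to archive and compact more aggressively than the dataset so that archival kicks in early.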
*/ @Test - public void testSyncConfig() throws Exception { - init(HoodieTableType.COPY_ON_WRITE); - HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + public void testManualRollbacks() throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType, false); + // Setting to archive more aggressively on the Metadata Table than the Dataset + final int maxDeltaCommitsBeforeCompaction = 4; + final int minArchiveCommitsMetadata = 2; + final int minArchiveCommitsDataset = 4; + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true) + .archiveCommitsWith(minArchiveCommitsMetadata, minArchiveCommitsMetadata + 1).retainCommits(1) + .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(minArchiveCommitsDataset, minArchiveCommitsDataset + 1) + .retainCommits(1).retainFileVersions(1).withAutoClean(false).withAsyncClean(true).build()) + .build(); - // Create the metadata table - String firstCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) { - client.startCommitWithTime(firstCommitTime); - client.insert(jsc.parallelize(dataGen.generateInserts(firstCommitTime, 2)), firstCommitTime); - client.syncTableMetadata(); - assertTrue(fs.exists(new Path(metadataTableBasePath))); - validateMetadata(client); + initWriteConfigAndMetatableWriter(writeConfig, true); + doWriteInsertAndUpsert(testTable, "000001", "000002"); + + for (int i = 3; i < 10; i++) { + doWriteOperation(testTable, "00000" + i); + archiveDataTable(writeConfig, metaClient); } + validateMetadata(testTable); - // If sync is disabled, the table will not sync - HoodieWriteConfig config = getWriteConfigBuilder(true, true, false) - .withMetadataConfig(HoodieMetadataConfig.newBuilder() - .enable(true).enableMetrics(false).enableSync(false).build()).build(); - final String metadataTableMetaPath = metadataTableBasePath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME; - String secondCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, config, true)) { - client.startCommitWithTime(secondCommitTime); - client.insert(jsc.parallelize(dataGen.generateInserts(secondCommitTime, 2)), secondCommitTime); - client.syncTableMetadata(); - - // Metadata Table should not have synced - assertTrue(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(firstCommitTime)))); - assertFalse(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(secondCommitTime)))); + // We can only rollback those commits whose deltacommit have not been archived yet. 
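+ // Once an instant's deltacommit has been archived, rolling it back is expected to fail with a HoodieMetadataException (asserted below).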
+ int numRollbacks = 0; + boolean exceptionRaised = false; + List allInstants = metaClient.reloadActiveTimeline().getCommitsTimeline().getReverseOrderedInstants().collect(Collectors.toList()); + for (HoodieInstant instantToRollback : allInstants) { + try { + testTable.doRollback(instantToRollback.getTimestamp(), String.valueOf(Time.now())); + validateMetadata(testTable); + ++numRollbacks; + } catch (HoodieMetadataException e) { + exceptionRaised = true; + break; + } } - // If sync is enabled, the table will sync - String thirdCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) { - client.startCommitWithTime(thirdCommitTime); - client.insert(jsc.parallelize(dataGen.generateInserts(thirdCommitTime, 2)), thirdCommitTime); - client.syncTableMetadata(); - - // Metadata Table should have synced - assertTrue(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(firstCommitTime)))); - assertTrue(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(secondCommitTime)))); - assertTrue(fs.exists(new Path(metadataTableMetaPath, HoodieTimeline.makeDeltaFileName(thirdCommitTime)))); + assertTrue(exceptionRaised, "Rollback of archived instants should fail"); + // Since each rollback also creates a deltacommit, we can only support rolling back of half of the original + // instants present before rollback started. + assertTrue(numRollbacks >= Math.max(minArchiveCommitsDataset, minArchiveCommitsMetadata) / 2, + "Rollbacks of non archived instants should work"); + } + + /** + * Test sync of table operations. + */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testSync(HoodieTableType tableType) throws Exception { + init(tableType, false); + // Initial commits without metadata table enabled + writeConfig = getWriteConfigBuilder(true, false, false).build(); + doPreBootstrapOperations(testTable, "00000001", "00000002"); + + // Enable metadata table so it initialized by listing from file system + writeConfig = getWriteConfigBuilder(true, true, false).build(); + + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + validateMetadata(testTable); + + doWriteOperation(testTable, "00000003", INSERT); + doWriteOperation(testTable, "00000004", UPSERT); + doWriteOperation(testTable, "00000005", UPSERT); + + // trigger compaction + if (MERGE_ON_READ.equals(tableType)) { + doCompactionAndValidate(testTable, "00000006"); } + + // trigger an upsert + doWriteOperation(testTable, "00000008"); + // trigger delete + doWriteOperation(testTable, "00000009", DELETE); + // trigger clean + doCleanAndValidate(testTable, "00000010", asList("00000003", "00000004")); + // trigger another upsert + doWriteOperation(testTable, "00000011"); + // trigger clustering + doClusterAndValidate(testTable, "00000012"); + + // If there is an inflight operation, the Metadata Table is not updated beyond that operations but the + // in-memory merge should consider all the completed operations. + HoodieCommitMetadata inflightCommitMeta = testTable.doWriteOperation("00000007", UPSERT, emptyList(), + asList("p1", "p2"), 2, false, true); + // trigger upsert + doWriteOperation(testTable, "00000013"); + // testTable validation will fetch only files pertaining to completed commits. So, validateMetadata() will skip files for 007 + // while validating against actual metadata table. 
+ validateMetadata(testTable, singletonList("00000007")); + + // Remove the inflight instance holding back table sync + testTable.moveInflightCommitToComplete("00000007", inflightCommitMeta); + validateMetadata(testTable); + // A regular commit should get synced + doWriteOperation(testTable, "00000014"); + validateMetadata(testTable, emptyList(), true); } + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapLargeCommitList(HoodieTableType tableType) throws Exception { + init(tableType); + for (int i = 1; i < 25; i += 7) { + String commitTime1 = ((i > 9) ? ("00000") : ("000000")) + i; + String commitTime2 = ((i > 9) ? ("00000") : ("000000")) + (i + 1); + String commitTime3 = ((i > 9) ? ("00000") : ("000000")) + (i + 2); + String commitTime4 = ((i > 9) ? ("00000") : ("000000")) + (i + 3); + String commitTime5 = ((i > 9) ? ("00000") : ("000000")) + (i + 4); + String commitTime6 = ((i > 9) ? ("00000") : ("000000")) + (i + 5); + String commitTime7 = ((i > 9) ? ("00000") : ("000000")) + (i + 6); + doWriteOperation(testTable, commitTime1, INSERT); + doWriteOperation(testTable, commitTime2); + doClean(testTable, commitTime3, Arrays.asList(commitTime1)); + doWriteOperation(testTable, commitTime4); + if (tableType == MERGE_ON_READ) { + doCompaction(testTable, commitTime5); + } + doWriteOperation(testTable, commitTime6); + doRollback(testTable, commitTime6, commitTime7); + } + validateMetadata(testTable, emptyList(), true); + } + + // Some operations are not feasible with test table infra. hence using write client to test those cases. + /** - * Only valid partition directories are added to the metadata. + * Rollback of the first commit should not trigger bootstrap errors at the metadata table. */ - @Test - public void testOnlyValidPartitionsAdded() throws Exception { - // This test requires local file system - init(HoodieTableType.COPY_ON_WRITE); + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testFirstCommitRollback(HoodieTableType tableType) throws Exception { + init(tableType); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - // Create an empty directory which is not a partition directory (lacks partition metadata) - final String nonPartitionDirectory = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-nonpartition"; - Files.createDirectories(Paths.get(basePath, nonPartitionDirectory)); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, + getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).build())) { - // Three directories which are partitions but will be ignored due to filter - final String filterDirRegex = ".*-filterDir\\d|\\..*"; - final String filteredDirectoryOne = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-filterDir1"; - final String filteredDirectoryTwo = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-filterDir2"; - final String filteredDirectoryThree = ".backups"; + // Write 1 + String commitTime = "0000001"; + List records = dataGen.generateInserts(commitTime, 20); + client.startCommitWithTime(commitTime); + List writeStatuses = client.insert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); - // Create some commits - HoodieTestTable testTable = HoodieTestTable.of(metaClient); - testTable.withPartitionMetaFiles("p1", "p2", filteredDirectoryOne, filteredDirectoryTwo, filteredDirectoryThree) - .addCommit("001").withBaseFilesInPartition("p1", 
10).withBaseFilesInPartition("p2", 10, 10) - .addCommit("002").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10, 10); + // Rollback the first commit + client.rollback(commitTime); - final HoodieWriteConfig writeConfig = - getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).withDirectoryFilterRegex(filterDirRegex).build()).build(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { - client.startCommitWithTime("005"); - client.insert(jsc.emptyRDD(), "005"); - - List partitions = metadataWriter(client).metadata().getAllPartitionPaths(); - assertFalse(partitions.contains(nonPartitionDirectory), - "Must not contain the non-partition " + nonPartitionDirectory); - assertTrue(partitions.contains("p1"), "Must contain partition p1"); - assertTrue(partitions.contains("p2"), "Must contain partition p2"); - - assertFalse(partitions.contains(filteredDirectoryOne), - "Must not contain the filtered directory " + filteredDirectoryOne); - assertFalse(partitions.contains(filteredDirectoryTwo), - "Must not contain the filtered directory " + filteredDirectoryTwo); - assertFalse(partitions.contains(filteredDirectoryThree), - "Must not contain the filtered directory " + filteredDirectoryThree); - - FileStatus[] statuses = metadata(client).getAllFilesInPartition(new Path(basePath, "p1")); - assertEquals(2, statuses.length); - statuses = metadata(client).getAllFilesInPartition(new Path(basePath, "p2")); - assertEquals(5, statuses.length); - Map partitionsToFilesMap = metadata(client).getAllFilesInPartitions( - Arrays.asList(basePath + "/p1", basePath + "/p2")); - assertEquals(2, partitionsToFilesMap.size()); - assertEquals(2, partitionsToFilesMap.get(basePath + "/p1").length); - assertEquals(5, partitionsToFilesMap.get(basePath + "/p2").length); + // Write 2 + commitTime = "0000002"; + records = dataGen.generateInserts(commitTime, 10); + client.startCommitWithTime(commitTime); + writeStatuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); } } + /** - * Test various table operations sync to Metadata Table correctly. + * Test several table operations with restore. This test uses SparkRDDWriteClient. + * Once the restore support is ready in HoodieTestTable, then rewrite this test. 
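+ * Runs bulk insert, inserts, upserts, compaction (MERGE_ON_READ only), deletes and clean before restoring to an earlier instant.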
*/ @ParameterizedTest @EnumSource(HoodieTableType.class) - public void testTableOperations(HoodieTableType tableType) throws Exception { + public void testTableOperationsWithRestore(HoodieTableType tableType) throws Exception { init(tableType); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, + getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).build())) { // Write 1 (Bulk insert) - String newCommitTime = "001"; + String newCommitTime = "0000001"; List records = dataGen.generateInserts(newCommitTime, 20); client.startCommitWithTime(newCommitTime); List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); @@ -321,7 +545,7 @@ public void testTableOperations(HoodieTableType tableType) throws Exception { validateMetadata(client); // Write 2 (inserts) - newCommitTime = "002"; + newCommitTime = "0000002"; client.startCommitWithTime(newCommitTime); validateMetadata(client); @@ -331,15 +555,14 @@ public void testTableOperations(HoodieTableType tableType) throws Exception { validateMetadata(client); // Write 3 (updates) - newCommitTime = "003"; + newCommitTime = "0000003"; client.startCommitWithTime(newCommitTime); records = dataGen.generateUniqueUpdates(newCommitTime, 10); writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); assertNoWriteErrors(writeStatuses); - validateMetadata(client); // Write 4 (updates and inserts) - newCommitTime = "004"; + newCommitTime = "0000004"; client.startCommitWithTime(newCommitTime); records = dataGen.generateUpdates(newCommitTime, 10); writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); @@ -348,545 +571,528 @@ public void testTableOperations(HoodieTableType tableType) throws Exception { // Compaction if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) { - newCommitTime = "005"; + newCommitTime = "0000005"; client.scheduleCompactionAtInstant(newCommitTime, Option.empty()); client.compact(newCommitTime); validateMetadata(client); } // Write 5 (updates and inserts) - newCommitTime = "006"; + newCommitTime = "0000006"; client.startCommitWithTime(newCommitTime); records = dataGen.generateUpdates(newCommitTime, 5); writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); assertNoWriteErrors(writeStatuses); - validateMetadata(client); // Compaction if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) { - newCommitTime = "007"; + newCommitTime = "0000007"; client.scheduleCompactionAtInstant(newCommitTime, Option.empty()); client.compact(newCommitTime); validateMetadata(client); } // Deletes - newCommitTime = "008"; + newCommitTime = "0000009"; records = dataGen.generateDeletes(newCommitTime, 10); JavaRDD deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey()); client.startCommitWithTime(newCommitTime); client.delete(deleteKeys, newCommitTime); - validateMetadata(client); // Clean - newCommitTime = "009"; + newCommitTime = "0000009"; client.clean(newCommitTime); validateMetadata(client); // Restore - client.restoreToInstant("006"); + client.restoreToInstant("0000006"); validateMetadata(client); } } /** - * Test rollback of various table operations sync to Metadata Table correctly. + * Test multi-writer on metadata table with optimistic concurrency. 
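+ * One writer per partition path commits in parallel, guarded by a file-system based lock provider.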
*/ - @ParameterizedTest - @EnumSource(HoodieTableType.class) - public void testRollbackOperations(HoodieTableType tableType) throws Exception { - init(tableType); + @Test + public void testMetadataMultiWriter() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { - // Write 1 (Bulk insert) - String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - List records = dataGen.generateInserts(newCommitTime, 20); - client.startCommitWithTime(newCommitTime); - List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - validateMetadata(client); + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "3"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "5000"); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(FileSystemBasedLockProviderTestClass.class).build()) + .withProperties(properties) + .build(); - // Write 2 (inserts) + Rollback of inserts - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - records = dataGen.generateInserts(newCommitTime, 20); - writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - validateMetadata(client); - client.rollback(newCommitTime); - client.syncTableMetadata(); - validateMetadata(client); + ExecutorService executors = Executors.newFixedThreadPool(dataGen.getPartitionPaths().length); + // Create clients in advance + SparkRDDWriteClient[] writeClients = new SparkRDDWriteClient[dataGen.getPartitionPaths().length]; + for (int i = 0; i < dataGen.getPartitionPaths().length; i++) { + writeClients[i] = new SparkRDDWriteClient(engineContext, writeConfig); + } - // Write 3 (updates) + Rollback of updates - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - records = dataGen.generateUniqueUpdates(newCommitTime, 20); - writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - validateMetadata(client); - client.rollback(newCommitTime); - client.syncTableMetadata(); - validateMetadata(client); + // Parallel commits for separate partitions + List futures = new LinkedList<>(); + for (int i = 0; i < dataGen.getPartitionPaths().length; ++i) { + final int index = i; + String newCommitTime = "000000" + (index + 1); + Future future = executors.submit(() -> { + List records = dataGen.generateInsertsForPartition(newCommitTime, 100, dataGen.getPartitionPaths()[index]); + SparkRDDWriteClient writeClient = writeClients[index]; + writeClient.startCommitWithTime(newCommitTime); + List writeStatuses = writeClient.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + }); + futures.add(future); + 
} - // Rollback of updates and inserts - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - records = dataGen.generateUpdates(newCommitTime, 10); - writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - validateMetadata(client); - client.rollback(newCommitTime); - client.syncTableMetadata(); - validateMetadata(client); + // Wait for all commits to complete + for (Future future : futures) { + future.get(); + } - // Rollback of Compaction - if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) { - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.scheduleCompactionAtInstant(newCommitTime, Option.empty()); - client.compact(newCommitTime); - validateMetadata(client); - } + // Ensure all commits were synced to the Metadata Table + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + assertEquals(metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().countInstants(), 4); + assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000001"))); + assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000002"))); + assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000003"))); - // Rollback of Deletes - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - records = dataGen.generateDeletes(newCommitTime, 10); - JavaRDD deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey()); - client.startCommitWithTime(newCommitTime); - writeStatuses = client.delete(deleteKeys, newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - validateMetadata(client); - client.rollback(newCommitTime); - client.syncTableMetadata(); - validateMetadata(client); + // Compaction may occur if the commits completed in order + assertTrue(metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants() <= 1); - // Rollback of Clean - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.clean(newCommitTime); - validateMetadata(client); - client.rollback(newCommitTime); - client.syncTableMetadata(); - validateMetadata(client); - } + // Validation + validateMetadata(writeClients[0]); + } - // Rollback of partial commits - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, - getWriteConfigBuilder(false, true, false).withRollbackUsingMarkers(false).build())) { - // Write updates and inserts - String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateUpdates(newCommitTime, 10); - List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - client.rollback(newCommitTime); - client.syncTableMetadata(); - validateMetadata(client); - } + /** + * Tests that when inline cleaning is enabled and auto commit is set to true, there is no double locking, + * because auto clean is triggered within post-commit, which already happens within a lock.
+ * + * @throws Exception + */ + @Test + public void testMultiWriterForDoubleLocking() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - // Marker based rollback of partial commits - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, - getWriteConfigBuilder(false, true, false).withRollbackUsingMarkers(true).build())) { - // Write updates and inserts - String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateUpdates(newCommitTime, 10); - List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - client.rollback(newCommitTime); - client.syncTableMetadata(); - validateMetadata(client); + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "3"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "5000"); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(true).retainCommits(4).build()) + .withAutoCommit(false) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(FileSystemBasedLockProviderTestClass.class).build()) + .withProperties(properties) + .build(); + + SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig); + String partitionPath = dataGen.getPartitionPaths()[0]; + for (int j = 0; j < 6; j++) { + String newCommitTime = "000000" + j; + List records = dataGen.generateInsertsForPartition(newCommitTime, 100, partitionPath); + writeClient.startCommitWithTime(newCommitTime); + JavaRDD writeStatuses = writeClient.insert(jsc.parallelize(records, 1), newCommitTime); + writeClient.commit(newCommitTime, writeStatuses); } + + // Ensure all commits were synced to the Metadata Table + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + LOG.warn("total commits in metadata table " + metadataMetaClient.getActiveTimeline().getCommitsTimeline().countInstants()); + + // 6 commits and 2 cleaner commits. + assertEquals(metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().countInstants(), 8); + assertTrue(metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants() <= 1); + // Validation + validateMetadata(writeClient); } /** - * Test when syncing rollback to metadata if the commit being rolled back has not been synced that essentially a no-op occurs to metadata. - * Once explicit sync is called, metadata should match. + * Let's say a clustering commit succeeded in the metadata table, but failed before committing to the data table. + * Next time, when clustering kicks in, Hudi will roll back the pending clustering (in the data table) and re-attempt the clustering with the same + * instant time. So, this test ensures the 2nd attempt succeeds with metadata enabled. + * This is applicable to any table service where the instant time is fixed.
So, however many times the operation fails, the re-attempt will + * be made with the same commit time. + * This test uses clustering to exercise the scenario. */ - @ParameterizedTest - @EnumSource(HoodieTableType.class) - public void testRollbackUnsyncedCommit(HoodieTableType tableType) throws Exception { + @Test + public void testReattemptOfFailedClusteringCommit() throws Exception { + tableType = HoodieTableType.COPY_ON_WRITE; init(tableType); - HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + context = new HoodieSparkEngineContext(jsc); + HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false); + SparkRDDWriteClient client = getHoodieWriteClient(config); + + // Write 1 (Bulk insert) + String newCommitTime = "0000001"; + List records = dataGen.generateInserts(newCommitTime, 20); + client.startCommitWithTime(newCommitTime); + List writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // Write 2 (inserts) + newCommitTime = "0000002"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateInserts(newCommitTime, 20); + writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // set up the clustering config. + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringSortColumns("_row_key") + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build(); + + HoodieWriteConfig newWriteConfig = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER) + .withAutoCommit(false) + .withClusteringConfig(clusteringConfig).build(); + + // trigger clustering + SparkRDDWriteClient newClient = getHoodieWriteClient(newWriteConfig); + String clusteringCommitTime = newClient.scheduleClustering(Option.empty()).get().toString(); + HoodieWriteMetadata> clusterMetadata = newClient.cluster(clusteringCommitTime, true); + + // collect replaceFileIds for validation later. + Set replacedFileIds = new HashSet<>(); + clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> + partitionFiles.getValue().stream().forEach(file -> + replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file)))); + + // trigger new write to mimic other writes succeeding before re-attempt. + newCommitTime = "0000003"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateInserts(newCommitTime, 20); + writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // manually remove the completed clustering instant from the .hoodie folder to mimic a clustering that succeeded in the metadata table but failed in the data table.
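For context: a completed replacecommit on the data timeline is just the <instant>.replacecommit file under .hoodie, so removing it leaves the clustering looking pending on the data table while the metadata table still records it as committed; the FileCreateUtils call below does exactly that. A rough, hypothetical equivalent under the standard timeline layout (illustrative only, not part of the change):

    // delete the completed instant file so the clustering appears pending on the data timeline again
    fs.delete(new Path(basePath + "/.hoodie/" + clusteringCommitTime + ".replacecommit"), false);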
+ FileCreateUtils.deleteReplaceCommit(basePath, clusteringCommitTime); + HoodieWriteMetadata> updatedClusterMetadata = newClient.cluster(clusteringCommitTime, true); + + metaClient.reloadActiveTimeline(); + Set updatedReplacedFileIds = new HashSet<>(); + updatedClusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> + partitionFiles.getValue().stream().forEach(file -> + updatedReplacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file)))); + assertEquals(replacedFileIds, updatedReplacedFileIds); + validateMetadata(client); + } - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { - // Initialize table with metadata - String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - List records = dataGen.generateInserts(newCommitTime, 20); - client.startCommitWithTime(newCommitTime); - List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - validateMetadata(client); - } + /** + * Ensure that the reader only reads completed instants. + * + * @throws IOException + */ + @Test + public void testReader() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) { - // Commit with metadata disabled - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateUpdates(newCommitTime, 10); - List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - client.rollback(newCommitTime); - } + List records; + List writeStatuses; + String[] commitTimestamps = {HoodieActiveTimeline.createNewInstantTime(), HoodieActiveTimeline.createNewInstantTime(), + HoodieActiveTimeline.createNewInstantTime(), HoodieActiveTimeline.createNewInstantTime()}; - try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true))) { - assertFalse(metadata(client).isInSync()); - client.syncTableMetadata(); - validateMetadata(client); - } + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { + for (int i = 0; i < commitTimestamps.length; ++i) { + records = dataGen.generateInserts(commitTimestamps[i], 5); + client.startCommitWithTime(commitTimestamps[i]); + writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamps[i]).collect(); + assertNoWriteErrors(writeStatuses); + } - // If an unsynced commit is automatically rolled back during next commit, the rollback commit gets a timestamp - // greater than than the new commit which is started. Ensure that in this case the rollback is not processed - // as the earlier failed commit would not have been committed. - // - // Dataset: C1 C2 C3.inflight[failed] C4 R5[rolls back C3] - // Metadata: C1.delta C2.delta - // - // When R5 completes, C3.xxx will be deleted. When C4 completes, C4 and R5 will be committed to Metadata Table in - // that order. R5 should be neglected as C3 was never committed to metadata table. 
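Condensed, the re-attempt flow exercised above looks like this. This is an illustrative sketch reusing only names that already appear in the test (newClient, basePath, FileCreateUtils), not an additional change to the patch:

    // Schedule and run clustering; the commit lands in the metadata table.
    String clusteringInstant = newClient.scheduleClustering(Option.empty()).get().toString();
    newClient.cluster(clusteringInstant, true);

    // Simulate the data-table commit being lost, then re-attempt with the same instant time.
    FileCreateUtils.deleteReplaceCommit(basePath, clusteringInstant);
    newClient.cluster(clusteringInstant, true); // second attempt must succeed and replace the same file groups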
- newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, false), true)) { - // Metadata disabled and no auto-commit - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateUpdates(newCommitTime, 10); - List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - // Not committed so left in inflight state - client.syncTableMetadata(); - assertTrue(metadata(client).isInSync()); - validateMetadata(client); - } + // Ensure we can see files from each commit + Set timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()); + assertEquals(timelineTimestamps.size(), commitTimestamps.length); + for (int i = 0; i < commitTimestamps.length; ++i) { + assertTrue(timelineTimestamps.contains(commitTimestamps[i])); + } - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true), true)) { - // Metadata enabled - // The previous commit will be rolled back automatically - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateUpdates(newCommitTime, 10); - List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - assertTrue(metadata(client).isInSync()); - validateMetadata(client); - } + // mark each commit as incomplete and ensure files are not seen + for (int i = 0; i < commitTimestamps.length; ++i) { + FileCreateUtils.deleteCommit(basePath, commitTimestamps[i]); + timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()); + assertEquals(timelineTimestamps.size(), commitTimestamps.length - 1); + for (int j = 0; j < commitTimestamps.length; ++j) { + assertTrue(j == i || timelineTimestamps.contains(commitTimestamps[j])); + } + FileCreateUtils.createCommit(basePath, commitTimestamps[i]); + } - // In this scenario an async operations is started and completes around the same time of the failed commit. - // Rest of the reasoning is same as above test. - // C4.clean was an asynchronous clean started along with C3. The clean completed but C3 commit failed. - // - // Dataset: C1 C2 C3.inflight[failed] C4.clean C5 R6[rolls back C3] - // Metadata: C1.delta C2.delta - // - // When R6 completes, C3.xxx will be deleted. When C5 completes, C4, C5 and R6 will be committed to Metadata Table - // in that order. R6 should be neglected as C3 was never committed to metadata table. 
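The visibility assertions in testReader above rest on the fact that a data file's name carries the commit time that created it. A minimal sketch of that check, assuming a HoodieTableMetadata reader named metadata, a partition "p1", and a completed instant ts (all hypothetical local names):

    // Collect the commit times of every file the metadata table reports for the partition.
    Set<String> visibleInstants = new HashSet<>();
    for (FileStatus status : metadata.getAllFilesInPartition(new Path(basePath, "p1"))) {
      visibleInstants.add(FSUtils.getCommitTime(status.getPath().getName()));
    }
    // Once the completed file for ts is deleted from .hoodie, ts should no longer show up here.
    assertFalse(visibleInstants.contains(ts));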
- newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, false), true)) { - // Metadata disabled and no auto-commit - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateUpdates(newCommitTime, 10); - List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - // Not committed so left in inflight state - client.clean(); - client.syncTableMetadata(); - assertTrue(metadata(client).isInSync()); - validateMetadata(client); - } + // Test multiple incomplete commits + FileCreateUtils.deleteCommit(basePath, commitTimestamps[0]); + FileCreateUtils.deleteCommit(basePath, commitTimestamps[2]); + timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()); + assertEquals(timelineTimestamps.size(), commitTimestamps.length - 2); + for (int j = 0; j < commitTimestamps.length; ++j) { + assertTrue(j == 0 || j == 2 || timelineTimestamps.contains(commitTimestamps[j])); + } - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true), true)) { - // Metadata enabled - // The previous commit will be rolled back automatically - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateUpdates(newCommitTime, 10); - List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - assertTrue(metadata(client).isInSync()); - validateMetadata(client); + // Test no completed commits + for (int i = 0; i < commitTimestamps.length; ++i) { + FileCreateUtils.deleteCommit(basePath, commitTimestamps[i]); + } + timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()); + assertEquals(timelineTimestamps.size(), 0); } } /** - * Test that manual rollbacks work correctly and enough timeline history is maintained on the metadata table - * timeline. + * Instants on Metadata Table should be archived as per config but we always keep atlest the number of instants + * as on the dataset. + *

+ * Metadata Table should be automatically compacted as per config. */ - @ParameterizedTest - @EnumSource(HoodieTableType.class) - public void testManualRollbacks(HoodieTableType tableType) throws Exception { - init(tableType); + @Test + public void testCleaningArchivingAndCompaction() throws Exception { + init(HoodieTableType.COPY_ON_WRITE, false); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - // Setting to archive more aggressively on the Metadata Table than the Dataset - final int maxDeltaCommitsBeforeCompaction = 4; - final int minArchiveCommitsMetadata = 2; - final int minArchiveCommitsDataset = 4; + final int maxDeltaCommitsBeforeCompaction = 3; HoodieWriteConfig config = getWriteConfigBuilder(true, true, false) .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true) - .archiveCommitsWith(minArchiveCommitsMetadata, minArchiveCommitsMetadata + 1).retainCommits(1) + .archiveCommitsWith(40, 60).retainCommits(1) .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction).build()) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(minArchiveCommitsDataset, minArchiveCommitsDataset + 1) - .retainCommits(1).retainFileVersions(1).withAutoClean(false).withAsyncClean(true).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER) + .retainCommits(1).retainFileVersions(1).withAutoClean(true).withAsyncClean(false).build()) .build(); + List records; + String newCommitTime; try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, config)) { - // Initialize table with metadata - String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - List records = dataGen.generateInserts(newCommitTime, 20); - client.startCommitWithTime(newCommitTime); - List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - validateMetadata(client); - - // Perform multiple commits - for (int i = 1; i < 10; ++i) { + // Some initial commits so compaction is not triggered. + // 1 deltacommit will be from bootstrap. So we can perform maxDeltaCommitsBeforeCompaction - 2 more commits before + // compaction will be attempted. + for (int i = 0; i < maxDeltaCommitsBeforeCompaction - 2; ++i) { newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - if (i == 1) { - records = dataGen.generateInserts(newCommitTime, 5); - } else { - records = dataGen.generateUpdates(newCommitTime, 2); - } + records = dataGen.generateInserts(newCommitTime, 5); client.startCommitWithTime(newCommitTime); - writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); } - // We can only rollback those commits whose deltacommit have not been archived yet. 
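To make the counting in the new comments above concrete, here is a worked trace assuming maxDeltaCommitsBeforeCompaction = 3 as configured in this test:

    // Deltacommits on the metadata table as the test proceeds:
    //   bootstrap of the metadata table           -> 1 deltacommit, no compaction
    //   initial insert loop (runs 3 - 2 = 1 time) -> 2 deltacommits, still below the threshold
    //   the next insert                           -> 3 deltacommits, and that commit schedules the first
    //                                                metadata-table compaction (commit timeline count becomes 1)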
- int numRollbacks = 0; - boolean exceptionRaised = false; - - List allInstants = metaClient.reloadActiveTimeline().getCommitsTimeline().getReverseOrderedInstants() - .collect(Collectors.toList()); - for (HoodieInstant instantToRollback : allInstants) { - try { - client.rollback(instantToRollback.getTimestamp()); - client.syncTableMetadata(); - ++numRollbacks; - } catch (HoodieMetadataException e) { - exceptionRaised = true; - break; - } - } - - assertTrue(exceptionRaised, "Rollback of archived instants should fail"); - // Since each rollback also creates a deltacommit, we can only support rolling back of half of the original - // instants present before rollback started. - assertTrue(numRollbacks >= Math.max(minArchiveCommitsDataset, minArchiveCommitsMetadata) / 2, - "Rollbacks of non archived instants should work"); - } - } - - /** - * Test sync of table operations. - */ - @ParameterizedTest - @EnumSource(HoodieTableType.class) - @Disabled - public void testSync(HoodieTableType tableType) throws Exception { - init(tableType); - HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(config.getBasePath()).build(); - String newCommitTime; - List records; - List writeStatuses; + // There should not be any compaction yet and we have not performed more than maxDeltaCommitsBeforeCompaction + // deltacommits (1 will be due to bootstrap) + HoodieActiveTimeline metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 0); + assertEquals(metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants(), maxDeltaCommitsBeforeCompaction - 1); + assertEquals(datasetMetaClient.getArchivedTimeline().reload().countInstants(), 0); - // Initial commits without metadata table enabled - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) { + // Next commit will initiate a compaction newCommitTime = HoodieActiveTimeline.createNewInstantTime(); records = dataGen.generateInserts(newCommitTime, 5); client.startCommitWithTime(newCommitTime); - writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 1); + assertEquals(metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants(), maxDeltaCommitsBeforeCompaction + 1); + assertEquals(datasetMetaClient.getArchivedTimeline().reload().countInstants(), 0); + + // More than maxDeltaCommitsBeforeCompaction commits + String inflightCommitTime = newCommitTime; + for (int i = 0; i < maxDeltaCommitsBeforeCompaction + 1; ++i) { + newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + records = dataGen.generateInserts(newCommitTime, 5); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + if (i == 0) { + // Mark this commit inflight so compactions dont take place + FileCreateUtils.deleteCommit(basePath, newCommitTime); + FileCreateUtils.createInflightCommit(basePath, 
newCommitTime); + inflightCommitTime = newCommitTime; + } + } + + // Ensure no more compactions took place due to the leftover inflight commit + metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 1); + assertEquals(metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants(), + ((2 * maxDeltaCommitsBeforeCompaction) + (maxDeltaCommitsBeforeCompaction /* clean from dataset */) + 1)/* clean in metadata table */); + + // Complete commit + FileCreateUtils.createCommit(basePath, inflightCommitTime); + // Next commit should lead to compaction newCommitTime = HoodieActiveTimeline.createNewInstantTime(); records = dataGen.generateInserts(newCommitTime, 5); client.startCommitWithTime(newCommitTime); - writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - } + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Enable metadata table so it initialized by listing from file system - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { - // inserts - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - records = dataGen.generateInserts(newCommitTime, 5); - writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); + // Ensure compactions took place + metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 2); + assertEquals(metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants(), + ((2 * maxDeltaCommitsBeforeCompaction) + (maxDeltaCommitsBeforeCompaction + 1 /* clean from dataset */) + 2 /* clean in metadata table */)); + assertTrue(datasetMetaClient.getArchivedTimeline().reload().countInstants() > 0); validateMetadata(client); - assertTrue(metadata(client).isInSync()); } + } - // Various table operations without metadata table enabled - String restoreToInstant; - String inflightActionTimestamp; - String beforeInflightActionTimestamp; - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) { - // updates - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - records = dataGen.generateUniqueUpdates(newCommitTime, 5); - writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - assertTrue(metadata(client).isInSync()); - - // updates and inserts - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - records = dataGen.generateUpdates(newCommitTime, 10); - writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - assertTrue(metadata(client).isInSync()); - - // Compaction - if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) { - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.scheduleCompactionAtInstant(newCommitTime, Option.empty()); - client.compact(newCommitTime); - assertTrue(metadata(client).isInSync()); - } - - // Savepoint - restoreToInstant = newCommitTime; - if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { - client.savepoint("hoodie", "metadata 
test"); - assertTrue(metadata(client).isInSync()); - } + @Test + public void testUpgradeDowngrade() throws IOException { + init(HoodieTableType.COPY_ON_WRITE, false); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - // Record a timestamp for creating an inflight instance for sync testing - inflightActionTimestamp = HoodieActiveTimeline.createNewInstantTime(); - beforeInflightActionTimestamp = newCommitTime; + // Perform a commit. This should bootstrap the metadata table with latest version. + List records; + List writeStatuses; + String commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + HoodieWriteConfig writeConfig = getWriteConfig(true, true); - // Deletes - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - records = dataGen.generateDeletes(newCommitTime, 5); - JavaRDD deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey()); - client.startCommitWithTime(newCommitTime); - client.delete(deleteKeys, newCommitTime); - assertTrue(metadata(client).isInSync()); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect(); + assertNoWriteErrors(writeStatuses); + } - // Clean - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.clean(newCommitTime); - assertTrue(metadata(client).isInSync()); + // Metadata table should have been bootstrapped + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus oldStatus = fs.getFileStatus(new Path(metadataTableBasePath)); - // updates - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - records = dataGen.generateUniqueUpdates(newCommitTime, 10); - writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - assertTrue(metadata(client).isInSync()); + // set hoodie.table.version to 2 in hoodie.properties file + changeTableVersion(HoodieTableVersion.TWO); - // insert overwrite to test replacecommit - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime, HoodieTimeline.REPLACE_COMMIT_ACTION); - records = dataGen.generateInserts(newCommitTime, 5); - HoodieWriteResult replaceResult = client.insertOverwrite(jsc.parallelize(records, 1), newCommitTime); - writeStatuses = replaceResult.getWriteStatuses().collect(); + // With next commit the table should be deleted (as part of upgrade) + commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + metaClient.reloadActiveTimeline(); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect(); assertNoWriteErrors(writeStatuses); - assertTrue(metadata(client).isInSync()); } + assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not exist"); - // If there is an incomplete operation, the Metadata Table is not updated beyond that operations but the - // in-memory merge should consider all the completed operations. 
- Path inflightCleanPath = new Path(metaClient.getMetaPath(), HoodieTimeline.makeInflightCleanerFileName(inflightActionTimestamp)); - fs.create(inflightCleanPath).close(); - + // With next commit the table should be re-bootstrapped (currently in the constructor. To be changed) + commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { - // Restore cannot be done until the metadata table is in sync. See HUDI-1502 for details - client.syncTableMetadata(); - - // Table should sync only before the inflightActionTimestamp - HoodieBackedTableMetadataWriter writer = - (HoodieBackedTableMetadataWriter) SparkHoodieBackedTableMetadataWriter.create(hadoopConf, client.getConfig(), context); - assertEquals(writer.getMetadataReader().getUpdateTime().get(), beforeInflightActionTimestamp); - - // Reader should sync to all the completed instants - HoodieTableMetadata metadata = HoodieTableMetadata.create(context, client.getConfig().getMetadataConfig(), - client.getConfig().getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue()); - assertEquals(((HoodieBackedTableMetadata)metadata).getReaderTime().get(), newCommitTime); - - // Remove the inflight instance holding back table sync - fs.delete(inflightCleanPath, false); - client.syncTableMetadata(); - - writer = - (HoodieBackedTableMetadataWriter)SparkHoodieBackedTableMetadataWriter.create(hadoopConf, client.getConfig(), context); - assertEquals(writer.getMetadataReader().getUpdateTime().get(), newCommitTime); - - // Reader should sync to all the completed instants - metadata = HoodieTableMetadata.create(context, client.getConfig().getMetadataConfig(), - client.getConfig().getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue()); - assertEquals(writer.getMetadataReader().getUpdateTime().get(), newCommitTime); + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect(); + assertNoWriteErrors(writeStatuses); } - // Enable metadata table and ensure it is synced - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { - client.restoreToInstant(restoreToInstant); - assertFalse(metadata(client).isInSync()); + initMetaClient(); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.THREE.versionCode()); + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime()); - newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - client.startCommitWithTime(newCommitTime); - client.syncTableMetadata(); + // Test downgrade by running the downgrader + new UpgradeDowngrade(metaClient, writeConfig, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.TWO, null); - validateMetadata(client); - assertTrue(metadata(client).isInSync()); - } + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.TWO.versionCode()); + assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not exist"); } /** - * Instants on Metadata Table should be archived as per config but we always keep atlest the number of instants - * as on the dataset. 
Metadata Table should be automatically compacted as per config. + * When table needs to be upgraded and when multi writer is enabled, hudi rollsback partial commits. Upgrade itself is happening + * within a lock and hence rollback should not lock again. + * + * @throws IOException + * @throws InterruptedException */ @Test - public void testCleaningArchivingAndCompaction() throws Exception { - init(HoodieTableType.COPY_ON_WRITE); + public void testRollbackDuringUpgradeForDoubleLocking() throws IOException, InterruptedException { + init(HoodieTableType.COPY_ON_WRITE, false); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - final int maxDeltaCommitsBeforeCompaction = 4; - final int minArchiveLimit = 4; - final int maxArchiveLimit = 6; - HoodieWriteConfig config = getWriteConfigBuilder(true, true, false) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true) - .archiveCommitsWith(minArchiveLimit - 2, maxArchiveLimit - 2).retainCommits(1) - .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction).build()) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(minArchiveLimit, maxArchiveLimit) - .retainCommits(1).retainFileVersions(1).withAutoClean(true).withAsyncClean(true).build()) + // Perform a commit. This should bootstrap the metadata table with latest version. + List records; + JavaRDD writeStatuses; + String commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "3"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "5000"); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(false, true, false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(FileSystemBasedLockProviderTestClass.class).build()) + .withProperties(properties) .build(); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp); + client.commit(commitTimestamp, writeStatuses); + } - List records; - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, config)) { - for (int i = 1; i < 10; ++i) { - String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); - if (i == 1) { - records = dataGen.generateInserts(newCommitTime, 5); - } else { - records = dataGen.generateUpdates(newCommitTime, 2); - } - client.startCommitWithTime(newCommitTime); - List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - validateMetadata(client); - } + // Metadata table should have been bootstrapped + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus oldStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + + // trigger partial commit + metaClient.reloadActiveTimeline(); + commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + try (SparkRDDWriteClient client = new 
SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp); } - HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); - HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(config.getBasePath()).build(); - HoodieActiveTimeline metadataTimeline = metadataMetaClient.getActiveTimeline(); - // check that there are compactions. - assertTrue(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants() > 0); - // check that cleaning has, once after each compaction. - assertTrue(metadataTimeline.getCleanerTimeline().filterCompletedInstants().countInstants() > 0); - // ensure archiving has happened - long numDataCompletedInstants = datasetMetaClient.getActiveTimeline().filterCompletedInstants().countInstants(); - long numDeltaCommits = metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants(); - assertTrue(numDeltaCommits >= minArchiveLimit); - assertTrue(numDeltaCommits < numDataCompletedInstants, "Must have less delta commits than total completed instants on data timeline."); + // set hoodie.table.version to 2 in hoodie.properties file + changeTableVersion(HoodieTableVersion.TWO); + writeConfig = getWriteConfigBuilder(true, true, false) + .withRollbackUsingMarkers(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(FileSystemBasedLockProviderTestClass.class).build()) + .withProperties(properties) + .build(); + + // With next commit the table should be deleted (as part of upgrade) and partial commit should be rolled back. + metaClient.reloadActiveTimeline(); + commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp); + assertNoWriteErrors(writeStatuses.collect()); + } + assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not exist"); + + // With next commit the table should be re-bootstrapped (currently in the constructor. 
To be changed) + commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp); + assertNoWriteErrors(writeStatuses.collect()); + } + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + + initMetaClient(); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.THREE.versionCode()); + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime()); } /** * Test various error scenarios. */ - @Test + //@Test + @Disabled public void testErrorCases() throws Exception { init(HoodieTableType.COPY_ON_WRITE); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); @@ -926,18 +1132,15 @@ public void testErrorCases() throws Exception { } } - /** - * Test non-partitioned datasets. - */ - //@Test + @Test public void testNonPartitioned() throws Exception { - init(HoodieTableType.COPY_ON_WRITE); + init(HoodieTableType.COPY_ON_WRITE, false); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); HoodieTestDataGenerator nonPartitionedGenerator = new HoodieTestDataGenerator(new String[] {""}); try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { // Write 1 (Bulk insert) - String newCommitTime = "001"; + String newCommitTime = "0000001"; List records = nonPartitionedGenerator.generateInserts(newCommitTime, 10); client.startCommitWithTime(newCommitTime); List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); @@ -953,7 +1156,7 @@ public void testNonPartitioned() throws Exception { */ @Test public void testMetadataMetrics() throws Exception { - init(HoodieTableType.COPY_ON_WRITE); + init(HoodieTableType.COPY_ON_WRITE, false); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true).build())) { @@ -969,148 +1172,64 @@ public void testMetadataMetrics() throws Exception { assertTrue(metricsRegistry.getAllCounts().containsKey(HoodieMetadataMetrics.INITIALIZE_STR + ".count")); assertTrue(metricsRegistry.getAllCounts().containsKey(HoodieMetadataMetrics.INITIALIZE_STR + ".totalDuration")); assertTrue(metricsRegistry.getAllCounts().get(HoodieMetadataMetrics.INITIALIZE_STR + ".count") >= 1L); - assertTrue(metricsRegistry.getAllCounts().containsKey("basefile.size")); - assertTrue(metricsRegistry.getAllCounts().containsKey("logfile.size")); - assertTrue(metricsRegistry.getAllCounts().containsKey("basefile.count")); - assertTrue(metricsRegistry.getAllCounts().containsKey("logfile.count")); + final String prefix = MetadataPartitionType.FILES.partitionPath() + "."; + assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_COUNT_BASE_FILES)); + assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_COUNT_LOG_FILES)); + assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE)); + 
assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_TOTAL_LOG_FILE_SIZE)); } } - /** - * Test when reading from metadata table which is out of sync with dataset that results are still consistent. - */ - @Test - public void testMetadataOutOfSync() throws Exception { - init(HoodieTableType.COPY_ON_WRITE); - HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - - SparkRDDWriteClient unsyncedClient = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true)); - - // Enable metadata so table is initialized - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { - // Perform Bulk Insert - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateInserts(newCommitTime, 20); - client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); - } - - // Perform commit operations with metadata disabled - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) { - // Perform Insert - String newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateInserts(newCommitTime, 20); - client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); - - // Perform Upsert - newCommitTime = "003"; - client.startCommitWithTime(newCommitTime); - records = dataGen.generateUniqueUpdates(newCommitTime, 20); - client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - - // Compaction - if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) { - newCommitTime = "004"; - client.scheduleCompactionAtInstant(newCommitTime, Option.empty()); - client.compact(newCommitTime); - } - } - - assertFalse(metadata(unsyncedClient).isInSync()); - validateMetadata(unsyncedClient); - - // Perform clean operation with metadata disabled - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) { - // One more commit needed to trigger clean so upsert and compact - String newCommitTime = "005"; - client.startCommitWithTime(newCommitTime); - List records = dataGen.generateUpdates(newCommitTime, 20); - client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - - if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) { - newCommitTime = "006"; - client.scheduleCompactionAtInstant(newCommitTime, Option.empty()); - client.compact(newCommitTime); - } - - // Clean - newCommitTime = "007"; - client.clean(newCommitTime); - } - - assertFalse(metadata(unsyncedClient).isInSync()); - validateMetadata(unsyncedClient); - - // Perform restore with metadata disabled - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) { - client.restoreToInstant("004"); - } - - assertFalse(metadata(unsyncedClient).isInSync()); - validateMetadata(unsyncedClient); + private void doPreBootstrapOperations(HoodieTestTable testTable) throws Exception { + doPreBootstrapOperations(testTable, "0000001", "0000002"); } - /** - * Test that failure to perform deltacommit on the metadata table does not lead to missed sync. 
- */ - @Test - public void testMetdataTableCommitFailure() throws Exception { - init(HoodieTableType.COPY_ON_WRITE); - HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { - // Write 1 - String newCommitTime = "001"; - List records = dataGen.generateInserts(newCommitTime, 20); - client.startCommitWithTime(newCommitTime); - List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - - // Write 2 - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - records = dataGen.generateInserts(newCommitTime, 20); - writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - } + private void doPreBootstrapOperations(HoodieTestTable testTable, String commit1, String commit2) throws Exception { + testTable.doWriteOperation(commit1, INSERT, asList("p1", "p2"), asList("p1", "p2"), + 2, true); + testTable.doWriteOperation(commit2, UPSERT, asList("p1", "p2"), + 2, true); + validateMetadata(testTable); + } - // At this time both commits 001 and 002 must be synced to the metadata table - HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); - HoodieActiveTimeline timeline = metadataMetaClient.getActiveTimeline(); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "001"))); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "002"))); - - // Delete the 002 deltacommit completed instant to make it inflight - FileCreateUtils.deleteDeltaCommit(metadataTableBasePath, "002"); - timeline = metadataMetaClient.reloadActiveTimeline(); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "001"))); - assertTrue(timeline.containsInstant(new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "002"))); - - // In this commit deltacommit "002" will be rolled back and attempted again. 
- String latestCommitTime = HoodieActiveTimeline.createNewInstantTime(); - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { - String newCommitTime = "003"; - List records = dataGen.generateInserts(newCommitTime, 20); - client.startCommitWithTime(newCommitTime); - client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); + private void doWriteInsertAndUpsert(HoodieTestTable testTable) throws Exception { + doWriteInsertAndUpsert(testTable, "0000001", "0000002"); + } - records = dataGen.generateInserts(latestCommitTime, 20); - client.startCommitWithTime(latestCommitTime); - List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), latestCommitTime).collect(); - assertNoWriteErrors(writeStatuses); - } + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts) { + HoodieWriteConfig.Builder builder = getConfigBuilder(schemaStr, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER); + return builder + .withCompactionConfig( + HoodieCompactionConfig.newBuilder() + .compactionSmallFileSize(smallFileSize) + // Set rollback to LAZY so no inflights are deleted + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .insertSplitSize(insertSplitSize).build()) + .withStorageConfig( + HoodieStorageConfig.newBuilder() + .hfileMaxFileSize(dataGen.getEstimatedFileSizeInBytes(200)) + .parquetMaxFileSize(dataGen.getEstimatedFileSizeInBytes(200)).build()) + .withMergeAllowDuplicateOnInserts(mergeAllowDuplicateInserts) + .build(); + } - timeline = metadataMetaClient.reloadActiveTimeline(); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "001"))); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "002"))); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, latestCommitTime))); - assertTrue(timeline.getRollbackTimeline().countInstants() == 1); + public HoodieWriteConfig.Builder getConfigBuilder(String schemaStr, HoodieIndex.IndexType indexType, + HoodieFailedWritesCleaningPolicy cleaningPolicy) { + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr) + .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2) + .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .withWriteStatusClass(MetadataMergeWriteStatus.class) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(cleaningPolicy) + .compactionSmallFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).orcMaxFileSize(1024 * 1024).build()) + .forTable("test-trip-table") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()) + .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withEnableBackupForRemoteFileSystemView(false) // Fail test if problem connecting to timeline-server + .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); } - /** - * Validate the metadata tables contents to ensure it matches what is on the file system. 
- */ private void validateMetadata(SparkRDDWriteClient testClient) throws IOException { HoodieWriteConfig config = testClient.getConfig(); @@ -1169,7 +1288,7 @@ private void validateMetadata(SparkRDDWriteClient testClient) throws IOException Collections.sort(fsFileNames); Collections.sort(metadataFilenames); - assertEquals(fsStatuses.length, partitionToFilesMap.get(basePath + "/" + partition).length); + assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length); // File sizes should be valid Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0)); @@ -1214,7 +1333,6 @@ private void validateMetadata(SparkRDDWriteClient testClient) throws IOException .sum(); assertEquals(metadataFilenames.size(), numFiles); } catch (IOException e) { - // TODO Auto-generated catch block e.printStackTrace(); assertTrue(false, "Exception should not be raised: " + e); } @@ -1226,10 +1344,8 @@ private void validateMetadata(SparkRDDWriteClient testClient) throws IOException // Validate write config for metadata table HoodieWriteConfig metadataWriteConfig = metadataWriter.getWriteConfig(); assertFalse(metadataWriteConfig.isMetadataTableEnabled(), "No metadata table for metadata table"); - assertFalse(metadataWriteConfig.getFileListingMetadataVerify(), "No verify for metadata table"); // Metadata table should be in sync with the dataset - assertTrue(metadata(client).isInSync()); HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); // Metadata table is MOR @@ -1243,8 +1359,8 @@ private void validateMetadata(SparkRDDWriteClient testClient) throws IOException // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory // in the .hoodie folder. List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, HoodieTableMetadata.getMetadataTableBasePath(basePath), - false, false, false); - Assertions.assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size()); + false, false); + assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size()); // Metadata table should automatically compact and clean // versions are +1 as autoclean / compaction happens end of commits @@ -1261,6 +1377,23 @@ private void validateMetadata(SparkRDDWriteClient testClient) throws IOException LOG.info("Validation time=" + timer.endTimer()); } + /** + * Returns the list of all files in the dataset by iterating over the metadata table. 
+ * + * @throws IOException + * @throws IllegalArgumentException + */ + private List getAllFiles(HoodieTableMetadata metadata) throws Exception { + List allfiles = new LinkedList<>(); + for (String partition : metadata.getAllPartitionPaths()) { + for (FileStatus status : metadata.getAllFilesInPartition(new Path(basePath, partition))) { + allfiles.add(status.getPath()); + } + } + + return allfiles; + } + private HoodieBackedTableMetadataWriter metadataWriter(SparkRDDWriteClient client) { return (HoodieBackedTableMetadataWriter) SparkHoodieBackedTableMetadataWriter .create(hadoopConf, client.getConfig(), new HoodieSparkEngineContext(jsc)); @@ -1272,42 +1405,14 @@ private HoodieTableMetadata metadata(SparkRDDWriteClient client) { clientConfig.getSpillableMapBasePath()); } - // TODO: this can be moved to TestHarness after merge from master - private void assertNoWriteErrors(List statuses) { - // Verify there are no errors - for (WriteStatus status : statuses) { - assertFalse(status.hasErrors(), "Errors found in write of " + status.getFileId()); + private void changeTableVersion(HoodieTableVersion version) throws IOException { + metaClient.getTableConfig().setTableVersion(version); + Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE); + try (FSDataOutputStream os = metaClient.getFs().create(propertyFile)) { + metaClient.getTableConfig().getProps().store(os, ""); } } - private HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata) { - return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false).build(); - } - - private HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) { - return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics); - } - - private HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) { - return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).withDeleteParallelism(2).withRollbackParallelism(2).withFinalizeWriteParallelism(2) - .withAutoCommit(autoCommit) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) - .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1) - .withFailedWritesCleaningPolicy(policy) - .withAutoClean(false).retainCommits(1).retainFileVersions(1).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024 * 1024).build()) - .withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table") - .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder() - .withEnableBackupForRemoteFileSystemView(false).build()) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder() - .enable(useFileListingMetadata) - .enableMetrics(enableMetrics).build()) - .withMetricsConfig(HoodieMetricsConfig.newBuilder().on(enableMetrics) - .withExecutorMetrics(true).usePrefix("unit-test").build()); - } - @Override protected HoodieTableType getTableType() { return tableType; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java new file mode 100644 index 0000000000000..5242e9f33766d --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.metadata.HoodieBackedTableMetadata; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +public class TestHoodieBackedTableMetadata extends TestHoodieMetadataBase { + + private static final Logger LOG = LogManager.getLogger(TestHoodieBackedTableMetadata.class); + + @Test + public void testTableOperations() throws Exception { + HoodieTableType tableType = HoodieTableType.COPY_ON_WRITE; + init(tableType); + doWriteInsertAndUpsert(testTable); + + // trigger an upsert + doWriteOperation(testTable, "0000003"); + verifyBaseMetadataTable(); + } + + private void doWriteInsertAndUpsert(HoodieTestTable testTable) throws Exception { + doWriteInsertAndUpsert(testTable, "0000001", "0000002"); + } + + private void verifyBaseMetadataTable() throws IOException { + HoodieBackedTableMetadata tableMetadata = new HoodieBackedTableMetadata(context, writeConfig.getMetadataConfig(), writeConfig.getBasePath(), writeConfig.getSpillableMapBasePath(), false); + assertTrue(tableMetadata.enabled()); + List fsPartitionPaths = testTable.getAllPartitionPaths(); + List fsPartitions = new ArrayList<>(); + fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString())); + List metadataPartitions = tableMetadata.getAllPartitionPaths(); + + Collections.sort(fsPartitions); + Collections.sort(metadataPartitions); + + assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match"); + assertEquals(fsPartitions, metadataPartitions, "Partitions should match"); + + // Files within each partition should match + HoodieTable table = HoodieSparkTable.create(writeConfig, context, true); + 
TableFileSystemView tableView = table.getHoodieView(); + List fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList()); + Map partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths); + assertEquals(fsPartitions.size(), partitionToFilesMap.size()); + + fsPartitions.forEach(partition -> { + try { + validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition); + } catch (IOException e) { + fail("Exception should not be raised: " + e); + } + }); + } + +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java index de4f42177429a..bff9724b594f9 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java @@ -710,6 +710,53 @@ private void testHoodieConcatHandle(HoodieWriteConfig config, boolean isPrepped) 2, false, config.populateMetaFields()); } + /** + * Test Insert API for HoodieConcatHandle when incoming entries contain duplicate keys. + */ + @Test + public void testInsertsWithHoodieConcatHandleOnDuplicateIncomingKeys() throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(); + testHoodieConcatHandleOnDupInserts(cfgBuilder.build(), false); + } + + /** + * Test InsertPrepped API for HoodieConcatHandle when incoming entries contain duplicate keys. + */ + @Test + public void testInsertsPreppedWithHoodieConcatHandleOnDuplicateIncomingKeys() throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(); + testHoodieConcatHandleOnDupInserts(cfgBuilder.build(), true); + } + + private void testHoodieConcatHandleOnDupInserts(HoodieWriteConfig config, boolean isPrepped) throws Exception { + HoodieWriteConfig hoodieWriteConfig = getConfigBuilder() + .withProps(config.getProps()) + .withMergeAllowDuplicateOnInserts(true) + .build(); + + SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig); + + // Write 1 (only inserts) + String initCommitTime = "000"; + String newCommitTime = "001"; + int firstInsertRecords = 50; + insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, firstInsertRecords, SparkRDDWriteClient::insert, + isPrepped, true, firstInsertRecords, config.populateMetaFields()); + + // Write 2 (updates with duplicates) + String prevCommitTime = newCommitTime; + newCommitTime = "004"; + int secondInsertRecords = 100; // needs to be larger than firstInsertRecords to guarantee duplicate keys + List commitTimesBetweenPrevAndNew = Arrays.asList("002", "003"); + + final Function2, String, Integer> recordGenFunction = + generateWrapRecordsFn(isPrepped, hoodieWriteConfig, dataGen::generateUpdates); + + writeBatch(client, newCommitTime, prevCommitTime, Option.of(commitTimesBetweenPrevAndNew), initCommitTime, + secondInsertRecords, recordGenFunction, SparkRDDWriteClient::insert, true, secondInsertRecords, + firstInsertRecords + secondInsertRecords, 2, false, config.populateMetaFields()); + } + /** * Tests deletion of records. 
*/ @@ -2147,19 +2194,18 @@ public void testRollbackFailedCommitsToggleCleaningPolicy(boolean populateMetaFi assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 0); } - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testParallelInsertAndCleanPreviousFailedCommits(boolean populateMetaFields) throws Exception { + @Test + public void testParallelInsertAndCleanPreviousFailedCommits() throws Exception { HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY; ExecutorService service = Executors.newFixedThreadPool(2); HoodieTestUtils.init(hadoopConf, basePath); // Perform 2 failed writes to table - SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)); writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, false); client.close(); - client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)); writeBatch(client, "200", "200", Option.of(Arrays.asList("200")), "200", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, false); @@ -2167,7 +2213,7 @@ public void testParallelInsertAndCleanPreviousFailedCommits(boolean populateMeta // refresh data generator to delete records generated from failed commits dataGen = new HoodieTestDataGenerator(); // Create a succesful commit - Future> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)), + Future> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)), "300", "200", Option.of(Arrays.asList("300")), "200", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true)); commit3.get(); @@ -2177,17 +2223,17 @@ public void testParallelInsertAndCleanPreviousFailedCommits(boolean populateMeta CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).countInstants() == 0); assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2); assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 1); - client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)); // Await till enough time passes such that the first 2 failed commits heartbeats are expired boolean conditionMet = false; while (!conditionMet) { conditionMet = client.getHeartbeatClient().isHeartbeatExpired("200"); Thread.sleep(2000); } - Future> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)), + Future> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)), "400", "300", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true)); - Future clean1 = service.submit(() -> new SparkRDDWriteClient(context, 
getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)).clean()); + Future clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)).clean()); commit4.get(); clean1.get(); HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java index 9c4059a519504..5c75bb565a31d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java @@ -19,6 +19,7 @@ package org.apache.hudi.client.functional; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; import org.apache.hudi.common.model.HoodieKey; @@ -97,6 +98,10 @@ private static Stream indexTypeParams() { private HoodieWriteConfig config; private void setUp(IndexType indexType, boolean populateMetaFields) throws Exception { + setUp(indexType, populateMetaFields, true); + } + + private void setUp(IndexType indexType, boolean populateMetaFields, boolean enableMetadata) throws Exception { this.indexType = indexType; initPath(); initSparkContexts(); @@ -107,7 +112,7 @@ private void setUp(IndexType indexType, boolean populateMetaFields) throws Excep config = getConfigBuilder() .withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen()) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType) - .build()).withAutoCommit(false).build(); + .build()).withAutoCommit(false).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadata).build()).build(); writeClient = getHoodieWriteClient(config); this.index = writeClient.getIndex(); } @@ -130,7 +135,7 @@ public void testSimpleTagLocationAndUpdate(IndexType indexType, boolean populate HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); // Test tagLocation without any entries in index - JavaRDD javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); + JavaRDD javaRDD = tagLocation(index, writeRecords, hoodieTable); assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); // Insert totalRecords records @@ -140,14 +145,14 @@ public void testSimpleTagLocationAndUpdate(IndexType indexType, boolean populate // Now tagLocation for these records, index should not tag them since it was a failed // commit - javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); + javaRDD = tagLocation(index, writeRecords, hoodieTable); assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); // Now commit this & update location of records inserted and validate no errors writeClient.commit(newCommitTime, writeStatues); // Now tagLocation for these records, index should tag them correctly metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); + javaRDD = tagLocation(index, writeRecords, hoodieTable); Map recordKeyToPartitionPathMap = new HashMap(); List hoodieRecords = writeRecords.collect(); 
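The TestHoodieIndex hunks above and below replace the old (JavaRDD) index.tagLocation(records, context, hoodieTable) casts with a tagLocation(index, records, hoodieTable) helper. The helper itself is not part of these hunks; the sketch below shows what it presumably does, assuming the refactored index API exchanges the engine-agnostic HoodieData wrapper and that HoodieJavaRDD.of / HoodieJavaRDD.getJavaRDD (the adapter imported in the TestHoodieBloomIndex changes further down) handle the JavaRDD conversion. Names, generics and the exact signature are assumptions, not part of this diff.

    // Hypothetical test-harness helper, sketched from the call sites above.
    // Assumes org.apache.hudi.data.HoodieJavaRDD and the harness's HoodieEngineContext field "context".
    protected JavaRDD<HoodieRecord> tagLocation(HoodieIndex index, JavaRDD<HoodieRecord> records, HoodieTable table) {
      // wrap the Spark RDD into the engine-agnostic HoodieData, tag locations, then unwrap for the Spark-side assertions
      return HoodieJavaRDD.getJavaRDD(
          index.tagLocation(HoodieJavaRDD.of(records), context, table));
    }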
hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath())); @@ -180,7 +185,7 @@ public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean popul writeClient.startCommitWithTime(newCommitTime); JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - JavaRDD javaRDD1 = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); + JavaRDD javaRDD1 = tagLocation(index, writeRecords, hoodieTable); // Duplicate upsert and ensure correctness is maintained // We are trying to approximately imitate the case when the RDD is recomputed. For RDD creating, driver code is not @@ -196,7 +201,7 @@ public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean popul // Now tagLocation for these records, hbaseIndex should tag them correctly metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - JavaRDD javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); + JavaRDD javaRDD = tagLocation(index, writeRecords, hoodieTable); Map recordKeyToPartitionPathMap = new HashMap(); List hoodieRecords = writeRecords.collect(); @@ -220,7 +225,7 @@ public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean popul @ParameterizedTest @MethodSource("indexTypeParams") public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType, boolean populateMetaFields) throws Exception { - setUp(indexType, populateMetaFields); + setUp(indexType, populateMetaFields, false); String newCommitTime = writeClient.startCommit(); int totalRecords = 20 + random.nextInt(20); List records = dataGen.generateInserts(newCommitTime, totalRecords); @@ -236,7 +241,7 @@ public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType, bool HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); // Now tagLocation for these records, hbaseIndex should tag them - JavaRDD javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); + JavaRDD javaRDD = tagLocation(index, writeRecords, hoodieTable); assert (javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == totalRecords); // check tagged records are tagged with correct fileIds @@ -264,7 +269,7 @@ public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType, bool hoodieTable = HoodieSparkTable.create(config, context, metaClient); // Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled // back commit - javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); + javaRDD = tagLocation(index, writeRecords, hoodieTable); assert (javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 0); assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0); } @@ -307,7 +312,7 @@ public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - JavaRDD taggedRecordRDD = (JavaRDD) index.tagLocation(recordRDD, context, hoodieTable); + JavaRDD taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); // Should not find any files for (HoodieRecord record : taggedRecordRDD.collect()) { @@ -324,7 +329,7 @@ public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - taggedRecordRDD = (JavaRDD) 
index.tagLocation(recordRDD, context, hoodieTable); + taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); // Check results for (HoodieRecord record : taggedRecordRDD.collect()) { @@ -367,7 +372,8 @@ public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() thro .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType) .withGlobalSimpleIndexUpdatePartitionPath(true) .withBloomIndexUpdatePartitionPath(true) - .build()).build(); + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()).build(); writeClient = getHoodieWriteClient(config); index = writeClient.getIndex(); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); @@ -413,7 +419,7 @@ public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() thro // test against incoming record with a different partition JavaRDD recordRDD = jsc.parallelize(Collections.singletonList(incomingRecord)); - JavaRDD taggedRecordRDD = (JavaRDD) index.tagLocation(recordRDD, context, hoodieTable); + JavaRDD taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); assertEquals(2, taggedRecordRDD.count()); for (HoodieRecord record : taggedRecordRDD.collect()) { @@ -434,7 +440,7 @@ public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() thro // test against incoming record with the same partition JavaRDD recordRDDSamePartition = jsc .parallelize(Collections.singletonList(incomingRecordSamePartition)); - JavaRDD taggedRecordRDDSamePartition = (JavaRDD) index.tagLocation(recordRDDSamePartition, context, hoodieTable); + JavaRDD taggedRecordRDDSamePartition = tagLocation(index, recordRDDSamePartition, hoodieTable); assertEquals(1, taggedRecordRDDSamePartition.count()); HoodieRecord record = taggedRecordRDDSamePartition.first(); @@ -457,8 +463,8 @@ private HoodieWriteConfig.Builder getConfigBuilder() { } private JavaPairRDD>> getRecordLocations(JavaRDD keyRDD, HoodieTable hoodieTable) { - JavaRDD recordRDD = (JavaRDD) index.tagLocation( - keyRDD.map(k -> new HoodieRecord(k, new EmptyHoodieRecordPayload())), context, hoodieTable); + JavaRDD recordRDD = tagLocation( + index, keyRDD.map(k -> new HoodieRecord(k, new EmptyHoodieRecordPayload())), hoodieTable); return recordRDD.mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) : Option.empty()) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java new file mode 100644 index 0000000000000..cf261cc8994a1 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsConfig; +import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.HoodieTimelineArchiveLog; +import org.apache.hudi.testutils.HoodieClientTestHarness; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterEach; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Properties; + +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static org.apache.hudi.common.model.WriteOperationType.INSERT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; + +public class TestHoodieMetadataBase extends HoodieClientTestHarness { + + private static final Logger LOG = LogManager.getLogger(TestHoodieMetadataBase.class); + + protected static HoodieTestTable testTable; + protected String metadataTableBasePath; + protected HoodieTableType tableType; + protected HoodieWriteConfig writeConfig; + protected HoodieTableMetadataWriter metadataWriter; + + public void init(HoodieTableType tableType) throws IOException { + init(tableType, true); + } + + public void init(HoodieTableType tableType, boolean enableMetadataTable) throws IOException { + init(tableType, enableMetadataTable, true); + } + + public void init(HoodieTableType tableType, boolean enableMetadataTable, boolean enableFullScan) throws IOException { + this.tableType = tableType; + initPath(); + initSparkContexts("TestHoodieMetadata"); + initFileSystem(); + fs.mkdirs(new Path(basePath)); + initMetaClient(tableType); + initTestDataGenerator(); + metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + writeConfig = 
getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, enableMetadataTable, false, + enableFullScan).build(); + initWriteConfigAndMetatableWriter(writeConfig, enableMetadataTable); + } + + protected void initWriteConfigAndMetatableWriter(HoodieWriteConfig writeConfig, boolean enableMetadataTable) { + this.writeConfig = writeConfig; + if (enableMetadataTable) { + metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, writeConfig, context); + testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + } else { + testTable = HoodieTestTable.of(metaClient); + } + } + + @AfterEach + public void clean() throws Exception { + cleanupResources(); + } + + protected void doWriteInsertAndUpsert(HoodieTestTable testTable, String commit1, String commit2) throws Exception { + testTable.doWriteOperation(commit1, INSERT, asList("p1", "p2"), asList("p1", "p2"), + 4, false); + testTable.doWriteOperation(commit2, UPSERT, asList("p1", "p2"), + 4, false); + validateMetadata(testTable); + } + + protected void doWriteOperationAndValidateMetadata(HoodieTestTable testTable, String commitTime) throws Exception { + doWriteOperation(testTable, commitTime); + validateMetadata(testTable); + } + + protected void doWriteOperation(HoodieTestTable testTable, String commitTime) throws Exception { + doWriteOperation(testTable, commitTime, UPSERT); + } + + protected void doWriteOperationAndValidate(HoodieTestTable testTable, String commitTime) throws Exception { + doWriteOperationAndValidate(testTable, commitTime, UPSERT); + } + + protected void doWriteOperationAndValidate(HoodieTestTable testTable, String commitTime, WriteOperationType operationType) throws Exception { + doWriteOperation(testTable, commitTime, operationType); + validateMetadata(testTable); + } + + protected void doWriteOperation(HoodieTestTable testTable, String commitTime, WriteOperationType operationType) throws Exception { + testTable.doWriteOperation(commitTime, operationType, emptyList(), asList("p1", "p2"), 3); + } + + protected void doClean(HoodieTestTable testTable, String commitTime, List commitsToClean) throws IOException { + doCleanInternal(testTable, commitTime, commitsToClean, false); + } + + protected void doCleanAndValidate(HoodieTestTable testTable, String commitTime, List commitsToClean) throws IOException { + doCleanInternal(testTable, commitTime, commitsToClean, true); + } + + private void doCleanInternal(HoodieTestTable testTable, String commitTime, List commitsToClean, boolean validate) throws IOException { + testTable.doCleanBasedOnCommits(commitTime, commitsToClean); + if (validate) { + validateMetadata(testTable); + } + } + + protected void doCompaction(HoodieTestTable testTable, String commitTime) throws Exception { + doCompactionInternal(testTable, commitTime, false); + } + + protected void doCompactionAndValidate(HoodieTestTable testTable, String commitTime) throws Exception { + doCompactionInternal(testTable, commitTime, true); + } + + private void doCompactionInternal(HoodieTestTable testTable, String commitTime, boolean validate) throws Exception { + testTable.doCompaction(commitTime, asList("p1", "p2")); + if (validate) { + validateMetadata(testTable); + } + } + + protected void doCluster(HoodieTestTable testTable, String commitTime) throws Exception { + doClusterInternal(testTable, commitTime, false); + } + + protected void doClusterAndValidate(HoodieTestTable testTable, String commitTime) throws Exception { + doClusterInternal(testTable, commitTime, true); + } + + protected void 
doClusterInternal(HoodieTestTable testTable, String commitTime, boolean validate) throws Exception { + testTable.doCluster(commitTime, new HashMap<>(), Arrays.asList("p1", "p2"), 2); + if (validate) { + validateMetadata(testTable); + } + } + + protected void doRollback(HoodieTestTable testTable, String commitToRollback, String rollbackTime) throws Exception { + doRollbackInternal(testTable, commitToRollback, rollbackTime, false); + } + + protected void doRollbackAndValidate(HoodieTestTable testTable, String commitToRollback, String rollbackTime) throws Exception { + doRollbackInternal(testTable, commitToRollback, rollbackTime, true); + } + + private void doRollbackInternal(HoodieTestTable testTable, String commitToRollback, String rollbackTime, boolean validate) throws Exception { + testTable.doRollback(commitToRollback, rollbackTime); + if (validate) { + validateMetadata(testTable); + } + } + + protected void doPreBootstrapWriteOperation(HoodieTestTable testTable, String commitTime) throws Exception { + doPreBootstrapWriteOperation(testTable, UPSERT, commitTime); + } + + protected void doPreBootstrapWriteOperation(HoodieTestTable testTable, WriteOperationType writeOperationType, String commitTime) throws Exception { + doPreBootstrapWriteOperation(testTable, writeOperationType, commitTime, 2); + } + + protected void doPreBootstrapWriteOperation(HoodieTestTable testTable, WriteOperationType writeOperationType, String commitTime, int filesPerPartition) throws Exception { + testTable.doWriteOperation(commitTime, writeOperationType, asList("p1", "p2"), asList("p1", "p2"), + filesPerPartition, true); + } + + protected void doPreBootstrapClean(HoodieTestTable testTable, String commitTime, List commitsToClean) throws Exception { + testTable.doCleanBasedOnCommits(commitTime, commitsToClean); + } + + protected void doPreBootstrapRollback(HoodieTestTable testTable, String rollbackTime, String commitToRollback) throws Exception { + testTable.doRollback(commitToRollback, rollbackTime); + } + + protected void doPrebootstrapCompaction(HoodieTestTable testTable, String commitTime) throws Exception { + doPrebootstrapCompaction(testTable, commitTime, Arrays.asList("p1", "p2")); + } + + protected void doPrebootstrapCompaction(HoodieTestTable testTable, String commitTime, List partitions) throws Exception { + testTable.doCompaction(commitTime, partitions); + } + + protected void doPreBootstrapCluster(HoodieTestTable testTable, String commitTime) throws Exception { + testTable.doCluster(commitTime, new HashMap<>(), Arrays.asList("p1", "p2"), 2); + } + + protected void doPreBootstrapRestore(HoodieTestTable testTable, String restoreTime, String commitToRestore) throws Exception { + testTable.doRestore(commitToRestore, restoreTime); + } + + protected void archiveDataTable(HoodieWriteConfig writeConfig, HoodieTableMetaClient metaClient) throws IOException { + HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(writeConfig, table); + archiveLog.archiveIfRequired(context); + } + + protected void validateMetadata(HoodieTestTable testTable) throws IOException { + validateMetadata(testTable, emptyList()); + } + + protected void validateMetadata(HoodieTestTable testTable, boolean doFullValidation) throws IOException { + validateMetadata(testTable, emptyList(), doFullValidation); + } + + protected void validateMetadata(HoodieTestTable testTable, List inflightCommits) throws IOException { + validateMetadata(testTable, 
inflightCommits, false); + } + + protected void validateMetadata(HoodieTestTable testTable, List inflightCommits, boolean doFullValidation) throws IOException { + validateMetadata(testTable, inflightCommits, writeConfig, metadataTableBasePath, doFullValidation); + } + + protected HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata) { + return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false).build(); + } + + protected HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) { + return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics); + } + + protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, + boolean enableMetrics) { + return getWriteConfigBuilder(policy, autoCommit, useFileListingMetadata, enableMetrics, true); + } + + protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, + boolean enableMetrics, boolean enableFullScan) { + Properties properties = new Properties(); + properties.put(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key(), SimpleKeyGenerator.class.getName()); + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).withDeleteParallelism(2).withRollbackParallelism(2).withFinalizeWriteParallelism(2) + .withAutoCommit(autoCommit) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1) + .withFailedWritesCleaningPolicy(policy) + .withAutoClean(false).retainCommits(1).retainFileVersions(1).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024 * 1024).build()) + .withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table") + .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder() + .withEnableBackupForRemoteFileSystemView(false).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(useFileListingMetadata) + .enableFullScan(enableFullScan) + .enableMetrics(enableMetrics).build()) + .withMetricsConfig(HoodieMetricsConfig.newBuilder().on(enableMetrics) + .withExecutorMetrics(true).build()) + .withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder() + .usePrefix("unit-test").build()) + .withProperties(properties); + } + +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java new file mode 100644 index 0000000000000..12c8410c35e02 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.util.Arrays; + +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.apache.hudi.common.model.WriteOperationType.INSERT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; + +@Tag("functional") +public class TestHoodieMetadataBootstrap extends TestHoodieMetadataBase { + + private static final Logger LOG = LogManager.getLogger(TestHoodieMetadataBootstrap.class); + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapInsertUpsert(HoodieTableType tableType) throws Exception { + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000003"); + } + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapInsertUpsertClean(HoodieTableType tableType) throws Exception { + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + doPreBootstrapClean(testTable, "0000003", Arrays.asList("0000001")); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000004"); + } + doPreBootstrapWriteOperation(testTable, "0000005"); + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapInsertUpsertRollback(HoodieTableType tableType) throws Exception { + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + doPreBootstrapRollback(testTable, "0000003", "0000002"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000004"); + } + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapInsertUpsertCluster(HoodieTableType tableType) throws Exception { + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + 
doPreBootstrapWriteOperation(testTable, "0000002"); + doPreBootstrapCluster(testTable, "0000003"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000004"); + } + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapLargeCommitList(HoodieTableType tableType) throws Exception { + init(tableType, false); + for (int i = 1; i < 25; i += 7) { + String commitTime1 = ((i > 9) ? ("00000") : ("000000")) + i; + String commitTime2 = ((i > 9) ? ("00000") : ("000000")) + (i + 1); + String commitTime3 = ((i > 9) ? ("00000") : ("000000")) + (i + 2); + String commitTime4 = ((i > 9) ? ("00000") : ("000000")) + (i + 3); + String commitTime5 = ((i > 9) ? ("00000") : ("000000")) + (i + 4); + String commitTime6 = ((i > 9) ? ("00000") : ("000000")) + (i + 5); + String commitTime7 = ((i > 9) ? ("00000") : ("000000")) + (i + 6); + doPreBootstrapWriteOperation(testTable, INSERT, commitTime1); + doPreBootstrapWriteOperation(testTable, commitTime2); + doPreBootstrapClean(testTable, commitTime3, Arrays.asList(commitTime1)); + doPreBootstrapWriteOperation(testTable, commitTime4); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, commitTime5); + } + doPreBootstrapWriteOperation(testTable, commitTime6); + doPreBootstrapRollback(testTable, commitTime7, commitTime6); + } + bootstrapAndVerify(); + } + + @Test + public void testMetadataBootstrapInflightCommit() throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType, false); + + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + // add an inflight commit + HoodieCommitMetadata inflightCommitMeta = testTable.doWriteOperation("00000007", UPSERT, emptyList(), + asList("p1", "p2"), 2, true, true); + // bootstrap and following validation should fail. bootstrap should not happen. + bootstrapAndVerifyFailure(); + + // once the commit is complete, metadata should get fully synced. + // in prod code path, SparkHoodieBackedTableMetadataWriter.create() will be called for every commit, + // which may not be the case here if we directly call HoodieBackedTableMetadataWriter.update() + // hence lets first move the commit to complete and invoke sync directly + ((HoodieMetadataTestTable) testTable).moveInflightCommitToComplete("00000007", inflightCommitMeta, true); + syncTableMetadata(writeConfig); + validateMetadata(testTable); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapArchival(HoodieTableType tableType) throws Exception { + init(tableType, false); + writeConfig = getWriteConfig(2, 4); + for (int i = 1; i < 13; i += 7) { + String commitTime1 = ((i > 9) ? ("00000") : ("000000")) + i; + String commitTime2 = ((i > 9) ? ("00000") : ("000000")) + (i + 1); + String commitTime3 = ((i > 9) ? ("00000") : ("000000")) + (i + 2); + String commitTime4 = ((i > 9) ? ("00000") : ("000000")) + (i + 3); + String commitTime5 = ((i > 9) ? ("00000") : ("000000")) + (i + 4); + String commitTime6 = ((i > 9) ? ("00000") : ("000000")) + (i + 5); + String commitTime7 = ((i > 9) ? 
("00000") : ("000000")) + (i + 6); + doPreBootstrapWriteOperation(testTable, INSERT, commitTime1); + doPreBootstrapWriteOperation(testTable, commitTime2); + doPreBootstrapClean(testTable, commitTime3, Arrays.asList(commitTime1)); + doPreBootstrapWriteOperation(testTable, commitTime4); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, commitTime5); + } + doPreBootstrapWriteOperation(testTable, commitTime6); + doPreBootstrapRollback(testTable, commitTime7, commitTime6); + } + // archive and then bootstrap + archiveDataTable(writeConfig, metaClient); + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapAfterRestore(HoodieTableType tableType) throws Exception { + init(tableType, false); + testRestore(false); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapAfterRestoreAndUpserts(HoodieTableType tableType) throws Exception { + init(tableType, false); + testRestore(true); + } + + private void testRestore(boolean addUpsertsAfterRestore) throws Exception { + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000003"); + } + doPreBootstrapWriteOperation(testTable, "0000004"); + doPreBootstrapWriteOperation(testTable, "0000005"); + doPreBootstrapWriteOperation(testTable, "0000006"); + doPreBootstrapRestore(testTable, "0000007", "0000004"); + + if (addUpsertsAfterRestore) { + doPreBootstrapWriteOperation(testTable, "0000008"); + doPreBootstrapWriteOperation(testTable, "0000009"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000010"); + } + } + bootstrapAndVerify(); + } + + private void bootstrapAndVerify() throws Exception { + writeConfig = getWriteConfig(true, true); + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + validateMetadata(testTable); + // after bootstrap do two writes and validate its still functional. 
+ doWriteInsertAndUpsert(testTable); + validateMetadata(testTable); + } + + private void bootstrapAndVerifyFailure() throws Exception { + writeConfig = getWriteConfig(true, true); + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + try { + validateMetadata(testTable); + Assertions.fail("Should have failed"); + } catch (IllegalStateException e) { + // expected + } + } + + private void doWriteInsertAndUpsert(HoodieTestTable testTable) throws Exception { + doWriteInsertAndUpsert(testTable, "0000100", "0000101"); + } + + private HoodieWriteConfig getWriteConfig(int minArchivalCommits, int maxArchivalCommits) throws Exception { + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(minArchivalCommits, maxArchivalCommits).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .forTable("test-trip-table").build(); + } + + @Override + protected HoodieTableType getTableType() { + return tableType; + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java index 0af28cc8d6ad9..2fb364187598b 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java @@ -30,10 +30,11 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.index.HoodieIndex.IndexType; -import org.apache.hudi.index.bloom.SparkHoodieBloomIndex; -import org.apache.hudi.index.bloom.SparkHoodieGlobalBloomIndex; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.HoodieGlobalBloomIndex; import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex; -import org.apache.hudi.index.simple.SparkHoodieSimpleIndex; +import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex; +import org.apache.hudi.index.simple.HoodieSimpleIndex; import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.JavaRDD; @@ -68,29 +69,29 @@ public void testCreateIndex(IndexType indexType) throws Exception { case INMEMORY: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkInMemoryHashIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieInMemoryHashIndex); break; case BLOOM: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkHoodieBloomIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieBloomIndex); break; case GLOBAL_BLOOM: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(IndexType.GLOBAL_BLOOM).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkHoodieGlobalBloomIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieGlobalBloomIndex); break; case SIMPLE: config = clientConfigBuilder.withPath(basePath) 
.withIndexConfig(indexConfigBuilder.withIndexType(IndexType.SIMPLE).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkHoodieSimpleIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieSimpleIndex); break; case HBASE: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE) .withHBaseIndexConfig(new HoodieHBaseIndexConfig.Builder().build()).build()) .build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkHoodieHBaseIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof SparkHoodieHBaseIndex); break; default: // no -op. just for checkstyle errors @@ -103,7 +104,7 @@ public void testCreateDummyIndex() { HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); HoodieWriteConfig config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexClass(DummyHoodieIndex.class.getName()).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof DummyHoodieIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof DummyHoodieIndex); } @Test @@ -113,14 +114,14 @@ public void testCreateIndexWithException() { final HoodieWriteConfig config1 = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexClass(IndexWithConstructor.class.getName()).build()).build(); final Throwable thrown1 = assertThrows(HoodieException.class, () -> { - SparkHoodieIndex.createIndex(config1); + SparkHoodieIndexFactory.createIndex(config1); }, "exception is expected"); assertTrue(thrown1.getMessage().contains("is not a subclass of HoodieIndex")); final HoodieWriteConfig config2 = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexClass(IndexWithoutConstructor.class.getName()).build()).build(); final Throwable thrown2 = assertThrows(HoodieException.class, () -> { - SparkHoodieIndex.createIndex(config2); + SparkHoodieIndexFactory.createIndex(config2); }, "exception is expected"); assertTrue(thrown2.getMessage().contains("Unable to instantiate class")); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java index b325eb6b1c404..1334adb20d052 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java @@ -26,9 +26,12 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.testutils.RawTripTestPayload; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaPairRDD; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.io.HoodieKeyLookupHandle; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; @@ -103,7 +106,7 @@ private HoodieWriteConfig makeConfig(boolean rangePruning, boolean treeFiltering @MethodSource("configParams") public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { HoodieWriteConfig config = 
makeConfig(rangePruning, treeFiltering, bucketizedChecking); - SparkHoodieBloomIndex index = new SparkHoodieBloomIndex(config); + HoodieBloomIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); @@ -131,7 +134,7 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); - List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); // Still 0, as no valid commit assertEquals(0, filesList.size()); @@ -145,20 +148,20 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b if (rangePruning) { // these files will not have the key ranges - assertNull(filesList.get(0)._2().getMaxRecordKey()); - assertNull(filesList.get(0)._2().getMinRecordKey()); - assertFalse(filesList.get(1)._2().hasKeyRanges()); - assertNotNull(filesList.get(2)._2().getMaxRecordKey()); - assertNotNull(filesList.get(2)._2().getMinRecordKey()); - assertTrue(filesList.get(3)._2().hasKeyRanges()); + assertNull(filesList.get(0).getRight().getMaxRecordKey()); + assertNull(filesList.get(0).getRight().getMinRecordKey()); + assertFalse(filesList.get(1).getRight().hasKeyRanges()); + assertNotNull(filesList.get(2).getRight().getMaxRecordKey()); + assertNotNull(filesList.get(2).getRight().getMinRecordKey()); + assertTrue(filesList.get(3).getRight().hasKeyRanges()); // no longer sorted, but should have same files. 
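The testRangePruning hunk above bridges the Spark pair RDD into the engine-agnostic API with HoodieJavaPairRDD.of and unwraps the result with HoodieJavaRDD.getJavaRDD, while the returned elements move from Scala Tuple2 to Hudi's Pair. Spelled out with explicit element types (assumed, inferred from the getLeft/getRight assertions that follow), the round trip looks roughly like this:

    // Each element pairs a candidate fileId with the HoodieKey it must be compared against;
    // the type parameters are assumptions based on the groupingBy(t -> t.getRight().getRecordKey(), ...) usage below.
    List<Pair<String, HoodieKey>> comparisonKeyList = HoodieJavaRDD.getJavaRDD(
        index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo,
            HoodieJavaPairRDD.of(partitionRecordKeyPairRDD)))
        .collect();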
- List> expected = - Arrays.asList(new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3", "000", "000")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4", "001", "003"))); + List> expected = + Arrays.asList(new ImmutablePair<>("2016/04/01", new BloomIndexFileInfo("2")), + new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("1")), + new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("3", "000", "000")), + new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("4", "001", "003"))); assertEquals(expected, filesList); } } @@ -167,7 +170,7 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b @MethodSource("configParams") public void testRangePruning(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) { HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); - SparkHoodieBloomIndex index = new SparkHoodieBloomIndex(config); + HoodieBloomIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); final Map> partitionToFileIndexInfo = new HashMap<>(); partitionToFileIndexInfo.put("2017/10/22", @@ -179,12 +182,12 @@ public void testRangePruning(boolean rangePruning, boolean treeFiltering, boolea jsc.parallelize(Arrays.asList(new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t); - List> comparisonKeyList = - index.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect(); + List> comparisonKeyList = HoodieJavaRDD.getJavaRDD( + index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, HoodieJavaPairRDD.of(partitionRecordKeyPairRDD))).collect(); assertEquals(10, comparisonKeyList.size()); Map> recordKeyToFileComps = comparisonKeyList.stream() - .collect(Collectors.groupingBy(t -> t._2.getRecordKey(), Collectors.mapping(t -> t._1, Collectors.toList()))); + .collect(Collectors.groupingBy(t -> t.getRight().getRecordKey(), Collectors.mapping(Pair::getLeft, Collectors.toList()))); assertEquals(4, recordKeyToFileComps.size()); assertEquals(new HashSet<>(Arrays.asList("f1", "f3", "f4")), new HashSet<>(recordKeyToFileComps.get("002"))); @@ -262,10 +265,10 @@ public void testTagLocationWithEmptyRDD(boolean rangePruning, boolean treeFilter HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient); // Let's tag - SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); assertDoesNotThrow(() -> { - bloomIndex.tagLocation(recordRDD, context, table); + tagLocation(bloomIndex, recordRDD, table); }, "EmptyRDD should not result in IllegalArgumentException: Positive number of slices required"); } @@ -301,8 +304,8 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); // Let's tag - SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); - JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, context, hoodieTable); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + JavaRDD taggedRecordRDD = tagLocation(bloomIndex, recordRDD, hoodieTable); // Should not find any files for (HoodieRecord 
record : taggedRecordRDD.collect()) { @@ -315,7 +318,7 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4); // We do the tag again - taggedRecordRDD = bloomIndex.tagLocation(recordRDD, context, HoodieSparkTable.create(config, context, metaClient)); + taggedRecordRDD = tagLocation(bloomIndex, recordRDD, HoodieSparkTable.create(config, context, metaClient)); // Check results for (HoodieRecord record : taggedRecordRDD.collect()) { @@ -366,8 +369,9 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); // Let's tag - SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); - JavaRDD taggedRecords = bloomIndex.tagLocation(keysRDD.map(k -> new HoodieRecord(k, null)), context, hoodieTable); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + JavaRDD taggedRecords = tagLocation( + bloomIndex, keysRDD.map(k -> new HoodieRecord(k, null)), hoodieTable); JavaPairRDD>> recordLocationsRDD = taggedRecords .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) @@ -387,7 +391,7 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean // We do the tag again metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - taggedRecords = bloomIndex.tagLocation(keysRDD.map(k -> new HoodieRecord(k, null)), context, hoodieTable); + taggedRecords = tagLocation(bloomIndex, keysRDD.map(k -> new HoodieRecord(k, null)), hoodieTable); recordLocationsRDD = taggedRecords .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? 
Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) @@ -443,8 +447,8 @@ public void testBloomFilterFalseError(boolean rangePruning, boolean treeFilterin metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieSparkTable.create(config, context, metaClient); - SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); - JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, context, table); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + JavaRDD taggedRecordRDD = tagLocation(bloomIndex, recordRDD, table); // Check results for (HoodieRecord record : taggedRecordRDD.collect()) { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java index 3970ab2502c79..fa7d586d2dc0a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java @@ -22,8 +22,12 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaPairRDD; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; @@ -74,7 +78,8 @@ public void tearDown() throws IOException { @Test public void testLoadInvolvedFiles() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); + HoodieGlobalBloomIndex index = + new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); @@ -104,7 +109,7 @@ public void testLoadInvolvedFiles() throws Exception { // intentionally missed the partition "2015/03/12" to see if the GlobalBloomIndex can pick it up List partitions = Arrays.asList("2016/01/21", "2016/04/01"); // partitions will NOT be respected by this loadInvolvedFiles(...) 
call - List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); // Still 0, as no valid commit assertEquals(0, filesList.size()); @@ -138,7 +143,8 @@ public void testLoadInvolvedFiles() throws Exception { public void testExplodeRecordRDDWithFileComparisons() { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); + HoodieGlobalBloomIndex index = + new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); final Map> partitionToFileIndexInfo = new HashMap<>(); partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"), @@ -152,8 +158,9 @@ public void testExplodeRecordRDDWithFileComparisons() { jsc.parallelize(Arrays.asList(new Tuple2<>("2017/10/21", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/23", "004"))).mapToPair(t -> t); - List> comparisonKeyList = - index.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect(); + List> comparisonKeyList = HoodieJavaRDD.getJavaRDD( + index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, + HoodieJavaPairRDD.of(partitionRecordKeyPairRDD))).collect(); /* * expecting: f4, HoodieKey { recordKey=003 partitionPath=2017/10/23} f1, HoodieKey { recordKey=003 @@ -166,7 +173,7 @@ public void testExplodeRecordRDDWithFileComparisons() { assertEquals(10, comparisonKeyList.size()); Map> recordKeyToFileComps = comparisonKeyList.stream() - .collect(Collectors.groupingBy(t -> t._2.getRecordKey(), Collectors.mapping(Tuple2::_1, Collectors.toList()))); + .collect(Collectors.groupingBy(t -> t.getRight().getRecordKey(), Collectors.mapping(Pair::getKey, Collectors.toList()))); assertEquals(4, recordKeyToFileComps.size()); assertEquals(new HashSet<>(Arrays.asList("f4", "f1", "f3")), new HashSet<>(recordKeyToFileComps.get("002"))); @@ -179,7 +186,8 @@ public void testExplodeRecordRDDWithFileComparisons() { public void testTagLocation() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) .withIndexConfig(HoodieIndexConfig.newBuilder().withBloomIndexUpdatePartitionPath(false).build()).build(); - SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); + HoodieGlobalBloomIndex index = + new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); @@ -224,7 +232,7 @@ public void testTagLocation() throws Exception { String fileId4 = testTable.addCommit("4000").getFileIdWithInserts("2015/03/12", record4); // partitions will NOT be respected by this loadInvolvedFiles(...) 
call - JavaRDD taggedRecordRDD = index.tagLocation(recordRDD, context, hoodieTable); + JavaRDD taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); for (HoodieRecord record : taggedRecordRDD.collect()) { switch (record.getRecordKey()) { @@ -260,7 +268,8 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { .withPath(basePath) .withIndexConfig(HoodieIndexConfig.newBuilder().withBloomIndexUpdatePartitionPath(true).build()) .build(); - SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); + HoodieGlobalBloomIndex index = + new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); final String p1 = "2016/01/31"; @@ -304,7 +313,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { // test against incoming record with a different partition JavaRDD recordRDD = jsc.parallelize(Collections.singletonList(incomingRecord)); - JavaRDD taggedRecordRDD = index.tagLocation(recordRDD, context, hoodieTable); + JavaRDD taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); assertEquals(2, taggedRecordRDD.count()); for (HoodieRecord record : taggedRecordRDD.collect()) { @@ -325,7 +334,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { // test against incoming record with the same partition JavaRDD recordRDDSamePartition = jsc .parallelize(Collections.singletonList(incomingRecordSamePartition)); - JavaRDD taggedRecordRDDSamePartition = index.tagLocation(recordRDDSamePartition, context, hoodieTable); + JavaRDD taggedRecordRDDSamePartition = tagLocation(index, recordRDDSamePartition, hoodieTable); assertEquals(1, taggedRecordRDDSamePartition.count()); HoodieRecord record = taggedRecordRDDSamePartition.first(); @@ -335,10 +344,10 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { } // convert list to map to avoid sorting order dependencies - private static Map toFileMap(List> filesList) { + private static Map toFileMap(List> filesList) { Map filesMap = new HashMap<>(); - for (Tuple2 t : filesList) { - filesMap.put(t._1() + "/" + t._2().getFileId(), t._2()); + for (Pair t : filesList) { + filesMap.put(t.getKey() + "/" + t.getValue().getFileId(), t.getValue()); } return filesMap; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java index cdd3fa526b914..7cb9740a8c6cc 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java @@ -24,23 +24,28 @@ import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.client.utils.MetadataConversionUtils; import org.apache.hudi.common.HoodieCleanStat; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; -import 
org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.HoodieTimelineArchiveLog; @@ -48,19 +53,23 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Random; -import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -71,262 +80,137 @@ public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness { + private static final Logger LOG = LogManager.getLogger(TestHoodieTimelineArchiveLog.class); + private Configuration hadoopConf; private HoodieWrapperFileSystem wrapperFs; + private HoodieTableMetadataWriter metadataWriter; + private HoodieTestTable testTable; - @BeforeEach public void init() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + } + + public void init(HoodieTableType tableType) throws Exception { initPath(); initSparkContexts(); initMetaClient(); hadoopConf = context.getHadoopConf().get(); metaClient.getFs().mkdirs(new Path(basePath)); - metaClient = HoodieTestUtils.init(hadoopConf, basePath); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType); wrapperFs = metaClient.getFs(); hadoopConf.addResource(wrapperFs.getConf()); } + private void initWriteConfigAndMetatableWriter(HoodieWriteConfig writeConfig, boolean enableMetadataTable) { + if (enableMetadataTable) { + metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, writeConfig, context); + testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + } else { + testTable = HoodieTestTable.of(metaClient); + } + } + @AfterEach public void clean() throws IOException { cleanupResources(); } - @Test - public void testArchiveEmptyTable() throws IOException { - HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).forTable("test-trip-table").build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = 
HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); + private HoodieWriteConfig initTestTableAndGetWriteConfig(boolean enableMetadata, int minArchivalCommits, int maxArchivalCommits, int maxDeltaCommitsMetadataTable) throws Exception { + return initTestTableAndGetWriteConfig(enableMetadata, minArchivalCommits, maxArchivalCommits, maxDeltaCommitsMetadataTable, HoodieTableType.COPY_ON_WRITE); } - @Test - public void testArchiveTableWithArchival() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + private HoodieWriteConfig initTestTableAndGetWriteConfig(boolean enableMetadata, int minArchivalCommits, int maxArchivalCommits, int maxDeltaCommitsMetadataTable, + HoodieTableType tableType) throws Exception { + init(tableType); + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(basePath) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 4).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(minArchivalCommits, maxArchivalCommits).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadata) + .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsMetadataTable).build()) .forTable("test-trip-table").build(); - HoodieTestUtils.init(hadoopConf, basePath); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - // Requested Compaction - 
HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "104"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "104", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "105"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "105"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); - - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - - assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); - - createCleanMetadata("100", false); - createCleanMetadata("101", false); - createCleanMetadata("102", false); - createCleanMetadata("103", false); - createCleanMetadata("104", false); - createCleanMetadata("105", false); - createCleanMetadata("106", true); - createCleanMetadata("107", true); - - // reload the timeline and get all the commmits before archive - timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); - List originalCommits = timeline.getInstants().collect(Collectors.toList()); - - assertEquals(12, timeline.countInstants(), "Loaded 6 commits and the count should match"); - - // verify in-flight instants before archive - verifyInflightInstants(metaClient, 2); - - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - assertTrue(archiveLog.archiveIfRequired(context)); - - // reload the timeline and remove the remaining commits - timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); - originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList())); - - // Check compaction instants - List instants = metaClient.scanHoodieInstantsFromFileSystem( - new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE, false); - assertEquals(4, instants.size(), "Should delete all compaction instants < 104"); - assertFalse(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100")), - "Requested Compaction must be absent for 100"); - assertFalse(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100")), - "Inflight Compaction must be absent for 100"); - assertFalse(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101")), - "Requested Compaction must be absent for 101"); - assertFalse(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101")), - "Inflight Compaction must be absent for 101"); - assertFalse(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102")), - "Requested Compaction must be absent for 102"); - assertFalse(instants.contains(new 
HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102")), - "Inflight Compaction must be absent for 102"); - assertFalse(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103")), - "Requested Compaction must be absent for 103"); - assertFalse(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103")), - "Inflight Compaction must be absent for 103"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104")), - "Requested Compaction must be present for 104"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "104")), - "Inflight Compaction must be present for 104"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "105")), - "Requested Compaction must be present for 105"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "105")), - "Inflight Compaction must be present for 105"); - - // read the file - HoodieArchivedTimeline archivedTimeline = new HoodieArchivedTimeline(metaClient); - assertEquals(24, archivedTimeline.countInstants(), - "Total archived records and total read records are the same count"); - - //make sure the archived commits are the same as the (originalcommits - commitsleft) - Set readCommits = - archivedTimeline.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()); - assertEquals(originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()), readCommits, - "Read commits map should match the originalCommits - commitsLoadedFromArchival"); - - // verify in-flight instants after archive - verifyInflightInstants(metaClient, 2); + initWriteConfigAndMetatableWriter(writeConfig, enableMetadata); + return writeConfig; } @Test - public void testArchiveTableWithNoArchival() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) - .build(); + public void testArchiveEmptyTable() throws Exception { + init(); + HoodieWriteConfig cfg = + HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).forTable("test-trip-table").build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, 
HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - assertEquals(4, timeline.countInstants(), "Loaded 4 commits and the count should match"); boolean result = archiveLog.archiveIfRequired(context); assertTrue(result); - timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); - assertEquals(4, timeline.countInstants(), "Should not archive commits when maxCommitsToKeep is 5"); - - List instants = metaClient.scanHoodieInstantsFromFileSystem( - new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE, false); - assertEquals(8, instants.size(), "Should not delete any aux compaction files when maxCommitsToKeep is 5"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100")), - "Requested Compaction must be present for 100"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100")), - "Inflight Compaction must be present for 100"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101")), - "Requested Compaction must be present for 101"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101")), - "Inflight Compaction must be present for 101"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102")), - "Requested Compaction must be present for 102"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102")), - "Inflight Compaction must be present for 102"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103")), - "Requested Compaction must be present for 103"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103")), - "Inflight Compaction must be present for 103"); } - @Test - public void testArchiveCommitSafety() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) - .build(); - metaClient = 
HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "104", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveTableWithArchival(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 2); + + // min archival commits is 2 and max archival commits is 4. and so, after 5th commit, 3 commits will be archived. + // 1,2,3,4,5 : after archival -> 4,5 + // after 3 more commits, earliest 3 will be archived + // 4,5,6,7,8 : after archival -> 7, 8 + // after 9 no-op wrt archival. + for (int i = 1; i < 10; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + if (i < 5) { + assertEquals(originalCommits, commitsAfterArchival); + } else if (i == 5) { + // archival should have kicked in. + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003")), getActiveCommitInstants(Arrays.asList("00000004", "00000005")), commitsAfterArchival); + } else if (i < 8) { + assertEquals(originalCommits, commitsAfterArchival); + } else if (i == 8) { + // archival should have kicked in. + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003", "00000004", "00000005", "00000006")), + getActiveCommitInstants(Arrays.asList("00000007", "00000008")), commitsAfterArchival); + } else { + assertEquals(originalCommits, commitsAfterArchival); + } + } + } - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); - timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); - assertTrue(timeline.containsOrBeforeTimelineStarts("100"), "Archived commits should always be safe"); - assertTrue(timeline.containsOrBeforeTimelineStarts("101"), "Archived commits should always be safe"); - assertTrue(timeline.containsOrBeforeTimelineStarts("102"), "Archived commits should always be safe"); - assertTrue(timeline.containsOrBeforeTimelineStarts("103"), "Archived commits should always be safe"); + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testNoArchivalUntilMaxArchiveConfigWithExtraInflightCommits(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 5, 2); + + // when max archival commits is set to 5, until 6th commit there should not be any archival. 
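[Editorial note, not part of this patch] The comments above describe the archiveCommitsWith(minCommits, maxCommits) window: archival triggers once the active timeline holds more than maxCommits eligible instants and then trims it back to minCommits, unless something such as a pending compaction or an inflight commit holds it back. A rough, hypothetical sketch of that arithmetic, illustrative only and not Hudi's actual implementation:

class ArchivalWindowSketch {
  // hypothetical helper: how many instants archival would remove for a given active count
  static int commitsToArchive(int activeCommits, int minCommits, int maxCommits) {
    return activeCommits > maxCommits ? activeCommits - minCommits : 0;
  }

  public static void main(String[] args) {
    System.out.println(commitsToArchive(5, 2, 4)); // 3 -> commits 1,2,3 archived; 4,5 retained
    System.out.println(commitsToArchive(4, 2, 4)); // 0 -> still within the max, nothing archived
    System.out.println(commitsToArchive(6, 2, 5)); // 4 -> with (2, 5), archival starts only at the 6th commit
  }
}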
+ for (int i = 1; i < 6; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 2); + // archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + } + + // add couple of inflight. no archival should kick in. + testTable.doWriteOperation("00000006", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 2, false, true); + testTable.doWriteOperation("00000007", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 2, false, true); + + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); } @Test - public void testArchiveCommitSavepointNoHole() throws IOException { + public void testArchiveCommitSavepointNoHole() throws Exception { + init(); HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .build(); HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); @@ -353,89 +237,99 @@ public void testArchiveCommitSavepointNoHole() throws IOException { "Archived commits should always be safe"); } - @Test - public void testArchiveRollbacks() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) - .build(); - - createCommitAndRollbackFile("100", "101", false); - createCommitAndRollbackFile("102", "103", false); - createCommitAndRollbackFile("104", "105", false); - createCommitAndRollbackFile("106", "107", false); + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveRollbacksTestTable(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 3, 2); + + for (int i = 1; i < 9; i += 2) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? 
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + testTable.doRollback("0000000" + i, "0000000" + (i + 1)); + + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + + if (i != 7) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + // only time when archival will kick in + List expectedArchivedInstants = new ArrayList<>(); + expectedArchivedInstants.addAll(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000003"))); + expectedArchivedInstants.addAll(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000004"), HoodieTimeline.ROLLBACK_ACTION)); + List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000005", "00000007"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000006", "00000008"), HoodieTimeline.ROLLBACK_ACTION)); + verifyArchival(expectedArchivedInstants, expectedActiveInstants, commitsAfterArchival); + } + } + } - HoodieTable table = HoodieSparkTable.create(cfg, context); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testNoArchivalWithInflightCompactionInMiddle(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 2, + HoodieTableType.MERGE_ON_READ); + + // when max archival commits is set to 4, even after 7 commits, if there is an inflight compaction in the middle, archival should not kick in. + HoodieCommitMetadata inflightCompactionMetadata = null; + for (int i = 1; i < 8; i++) { + if (i == 2) { + inflightCompactionMetadata = testTable.doCompaction("0000000" + i, Arrays.asList("p1", "p2"), true); + } else { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + } + + // archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + if (i != 6) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + // on 6th commit, archival will kick in. but will archive only one commit since 2nd compaction commit is inflight. 
+ assertEquals(originalCommits.size() - commitsAfterArchival.size(), 1); + for (int j = 1; j <= 6; j++) { + if (j == 1) { + // first commit should be archived + assertFalse(commitsAfterArchival.contains(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j))); + } else if (j == 2) { + // 2nd compaction should not be archived + assertFalse(commitsAfterArchival.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "0000000" + j))); + } else { + // every other commit should not be archived + assertTrue(commitsAfterArchival.contains(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j))); + } + } + } + } - assertTrue(archiveLog.archiveIfRequired(context)); - HoodieTimeline timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); - assertEquals(2, timeline.countInstants(), - "first two commits must have been archived"); - assertFalse(metaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.ROLLBACK_ACTION, "101")), - "first rollback must have been archived"); - assertFalse(metaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.ROLLBACK_ACTION, "103")), - "second rollback must have been archived"); - assertTrue(metaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.ROLLBACK_ACTION, "105")), - "first rollback must have been archived"); - assertTrue(metaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.ROLLBACK_ACTION, "107")), - "second rollback must have been archived"); - } + // move inflight compaction to complete and add one regular write commit. archival should archive more commits. + // an extra one commit is required, bcoz compaction in data table will not trigger table services in metadata table. + // before this move, timeline : 2_inflight_compaction, 3,4,5,6,7. 
+ // after this move: 6,7,8 (2,3,4,5 will be archived) + testTable.moveInflightCompactionToComplete("00000002", inflightCompactionMetadata); + testTable.doWriteOperation("00000008", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), 2); - @Test - public void testArchiveCommitCompactionNoHole() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) - .build(); - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - HoodieTestDataGenerator.createCompactionRequestedFile(basePath, "101", wrapperFs.getConf()); - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - HoodieTestDataGenerator.createCompactionRequestedFile(basePath, "104", wrapperFs.getConf()); - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "106", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "107", wrapperFs.getConf()); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List commitsAfterArchival = commitsList.getValue(); - HoodieTimeline timeline = metaClient.getActiveTimeline().getWriteTimeline(); - assertEquals(8, timeline.countInstants(), "Loaded 6 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); - timeline = metaClient.getActiveTimeline().reload().getWriteTimeline(); - assertFalse(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "100")), - "Instants before oldest pending compaction can be removed"); - assertEquals(7, timeline.countInstants(), - "Since we have a pending compaction at 101, we should never archive any commit " - + "after 101 (we only archive 100)"); - assertTrue(timeline.containsInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101")), - "Requested Compaction must still be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102")), - "Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103")), - "Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104")), - "Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "105")), - "Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "106")), - 
"Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "107")), - "Instants greater than oldest pending compaction must be present"); + List archivedInstants = getAllArchivedCommitInstants(Arrays.asList("00000001", "00000003", "00000004", "00000005", "00000006"), HoodieTimeline.DELTA_COMMIT_ACTION); + archivedInstants.add(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "00000002")); + archivedInstants.add(new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000002")); + verifyArchival(archivedInstants, getActiveCommitInstants(Arrays.asList("00000007", "00000008"), HoodieTimeline.DELTA_COMMIT_ACTION), commitsAfterArchival); } @Test - public void testArchiveCommitTimeline() throws IOException { + public void testArchiveCommitTimeline() throws Exception { + init(); HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) - .build(); + HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).forTable("test-trip-table") + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTestDataGenerator.createCommitFile(basePath, "1", wrapperFs.getConf()); @@ -470,7 +364,8 @@ private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expect } @Test - public void testConvertCommitMetadata() { + public void testConvertCommitMetadata() throws Exception { + init(); HoodieCommitMetadata hoodieCommitMetadata = new HoodieCommitMetadata(); hoodieCommitMetadata.setOperationType(WriteOperationType.INSERT); @@ -481,110 +376,116 @@ public void testConvertCommitMetadata() { assertEquals(expectedCommitMetadata.getOperationType(), WriteOperationType.INSERT.toString()); } - @Test - public void testArchiveCompletedClean() throws IOException { - HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) - .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - - createCleanMetadata("10", false); - createCleanMetadata("11", false); - HoodieInstant notArchivedInstant1 = createCleanMetadata("12", false); - HoodieInstant notArchivedInstant2 = createCleanMetadata("13", false); - - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - - archiveLog.archiveIfRequired(context); - - List notArchivedInstants = metaClient.getActiveTimeline().reload().getInstants().collect(Collectors.toList()); - //There will be 3 * 2 files but due to TimelineLayoutV1 this will show as 2. 
- assertEquals(2, notArchivedInstants.size(), "Not archived instants should be 2"); - assertEquals(notArchivedInstants, Arrays.asList(notArchivedInstant1, notArchivedInstant2), ""); - } - - @Test - public void testArchiveCompletedRollback() throws IOException { - HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) - .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - - createCommitAndRollbackFile("6", "10", false); - createCommitAndRollbackFile("8", "11", false); - createCommitAndRollbackFile("7", "12", false); - HoodieInstant notArchivedInstant1 = new HoodieInstant(State.COMPLETED, "rollback", "12"); - - createCommitAndRollbackFile("5", "13", false); - HoodieInstant notArchivedInstant2 = new HoodieInstant(State.COMPLETED, "rollback", "13"); - - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - - archiveLog.archiveIfRequired(context); - - List notArchivedInstants = metaClient.getActiveTimeline().reload().getRollbackTimeline().getInstants().collect(Collectors.toList()); - //There will be 2 * 2 files but due to TimelineLayoutV1 this will show as 2. - assertEquals(2, notArchivedInstants.size(), "Not archived instants should be 2"); - assertEquals(notArchivedInstants, Arrays.asList(notArchivedInstant1, notArchivedInstant2), ""); - } - - @Test - public void testArchiveCompletedShouldRetainMinInstantsIfInstantsGreaterThanMaxtoKeep() throws IOException { - int minInstants = 2; - int maxInstants = 10; - HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(minInstants, maxInstants).build()) - .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - for (int i = 0; i < maxInstants + 2; i++) { - createCleanMetadata(i + "", false); + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveTableWithCleanCommits(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 2); + + // min archival commits is 2 and max archival commits is 4 (either the number of clean commits or the number of regular commits has to exceed 4 for archival to trigger). + // and so, after the 5th commit, 3 commits will be archived. + // 1,2,3,4,5,6 : after archival -> 1,5,6 (because 2,3,4,5 and 6 are clean commits and are eligible for archival) + // after the 7th and 8th commits, archival is a no-op. + Map cleanStats = new HashMap<>(); + cleanStats.put("p1", 1); + cleanStats.put("p2", 2); + for (int i = 1; i < 9; i++) { + if (i == 1) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 10); + } else if (i < 7) { + testTable.doClean("0000000" + i, cleanStats); + } else { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ?
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + } + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + if (i < 6) { + assertEquals(originalCommits, commitsAfterArchival); + } else if (i == 6) { + if (!enableMetadata) { + // 1,2,3,4,5,6 : after archival -> 1,5,6 (bcoz, 2,3,4,5 and 6 are clean commits and are eligible for archival) + List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000001"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000005", "00000006"), HoodieTimeline.CLEAN_ACTION)); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000003", "00000004"), HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival); + } else { + // with metadata enabled, archival in data table is fenced based on compaction in metadata table. Clean commits in data table will not trigger compaction in + // metadata table. + List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000001"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000002", "00000003", "00000004", "00000005", "00000006"), HoodieTimeline.CLEAN_ACTION)); + verifyArchival(getAllArchivedCommitInstants(Collections.emptyList(), HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival); + } + } else { + if (!enableMetadata) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + if (i == 7) { + // when i == 7 compaction in metadata table will be triggered and hence archival in datatable will kick in. 
+ // 1,2,3,4,5,6 : after archival -> 1,5,6 (because 2,3,4,5 and 6 are clean commits and are eligible for archival) + List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000001", "00000007"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000005", "00000006"), HoodieTimeline.CLEAN_ACTION)); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000003", "00000004"), HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival); + } else { + assertEquals(originalCommits, commitsAfterArchival); + } + } + } } - - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - - archiveLog.archiveIfRequired(context); - assertEquals(minInstants, metaClient.getActiveTimeline().reload().getInstants().count()); } @Test - public void testArchiveCompletedShouldNotArchiveIfInstantsLessThanMaxtoKeep() throws IOException { - int minInstants = 2; - int maxInstants = 10; - HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(minInstants, maxInstants).build()) - .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - for (int i = 0; i < maxInstants; i++) { - createCleanMetadata(i + "", false); + public void testArchiveRollbacksAndCleanTestTable() throws Exception { + boolean enableMetadata = false; + int minArchiveCommits = 2; + int maxArchiveCommits = 9; + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, minArchiveCommits, maxArchiveCommits, 2); + + // trigger 1 commit to add a lot of files so that future cleans can clean them up + testTable.doWriteOperation("00000001", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 20); + + Map partitionToFileDeleteCount = new HashMap<>(); + partitionToFileDeleteCount.put("p1", 1); + partitionToFileDeleteCount.put("p2", 1); + // we are triggering 10 clean commits. (1 is commit, 2 -> 11 is clean) + for (int i = 2; i <= (maxArchiveCommits + 2); i++) { + testTable.doClean((i > 9 ? ("000000") : ("0000000")) + i, partitionToFileDeleteCount); } - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); + // we are triggering 4 commits (12, 14, 16, 18), each followed by a rollback (13, 15, 17, 19) + for (int i = 12; i <= (2 * maxArchiveCommits); i += 2) { + testTable.doWriteOperation("000000" + i, WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + testTable.doRollback("000000" + i, "000000" + (i + 1)); + } - archiveLog.archiveIfRequired(context); - assertEquals(maxInstants, metaClient.getActiveTimeline().reload().getInstants().count()); + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + + // out of 10 clean commits, 8 will be archived (2 to 9); 10 and 11 will be active. + // as for regular commits, there aren't 9 of them yet, so all of them will stay active.
+    List<HoodieInstant> expectedActiveInstants = new ArrayList<>();
+    expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000010", "00000011"), HoodieTimeline.CLEAN_ACTION));
+    expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000001", "00000012", "00000014", "00000016", "00000018")));
+    expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000013", "00000015", "00000017", "00000019"), HoodieTimeline.ROLLBACK_ACTION));
+    verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000003", "00000004", "00000005", "00000006", "00000007", "00000008", "00000009"),
+        HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival);
   }
 
   @Test
-  public void testArchiveCompletedRollbackAndClean() throws IOException {
+  public void testArchiveCompletedRollbackAndClean() throws Exception {
+    init();
     int minInstantsToKeep = 2;
     int maxInstantsToKeep = 10;
     HoodieWriteConfig cfg =
-        HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
-            .withParallelism(2, 2).forTable("test-trip-table")
-            .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(minInstantsToKeep, maxInstantsToKeep).build())
-            .build();
+        HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
+            .withParallelism(2, 2).forTable("test-trip-table")
+            .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(minInstantsToKeep, maxInstantsToKeep).build())
+            .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build())
+            .build();
     metaClient = HoodieTableMetaClient.reload(metaClient);
 
     int startInstant = 1;
@@ -612,12 +513,14 @@ public void testArchiveCompletedRollbackAndClean() throws IOException {
   }
 
   @Test
-  public void testArchiveInflightClean() throws IOException {
+  public void testArchiveInflightClean() throws Exception {
+    init();
     HoodieWriteConfig cfg =
-        HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
-            .withParallelism(2, 2).forTable("test-trip-table")
-            .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build())
-            .build();
+        HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
+            .withParallelism(2, 2).forTable("test-trip-table")
+            .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build())
+            .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build())
+            .build();
     metaClient = HoodieTableMetaClient.reload(metaClient);
 
     createCleanMetadata("10", false);
@@ -636,6 +539,137 @@ public void testArchiveInflightClean() throws IOException {
     assertEquals(notArchivedInstants, Arrays.asList(notArchivedInstant1, notArchivedInstant2, notArchivedInstant3), "");
   }
 
+  @Test
+  public void testArchiveTableWithMetadataTableCompaction() throws Exception {
+    HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 4, 7);
+
+    // min archival commits is 2 and max archival commits is 4, so ideally archival should kick in after the 5th commit. But max delta commits in the metadata table is set to 6,
+    // so archival will kick in only by the 7th commit in the data table (1 commit for bootstrap + 6 commits from the data table).
+    // and then the 2nd compaction will take place
+    for (int i = 1; i < 6; i++) {
+      testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ?
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + } + + // one more commit will trigger compaction in metadata table and will let archival move forward. + testTable.doWriteOperation("00000006", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + // before archival 1,2,3,4,5,6 + // after archival 5,6 + assertEquals(originalCommits.size() - commitsAfterArchival.size(), 4); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003", "00000004")), getActiveCommitInstants(Arrays.asList("00000005", "00000006")), commitsAfterArchival); + + // 3 more commits, 5 and 6 will be archived. but will not move after 6 since compaction has to kick in in metadata table. + testTable.doWriteOperation("00000007", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + testTable.doWriteOperation("00000008", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + + // ideally, this will archive commits 5, 6, 7, but since compaction in metadata is until 6, only 5 and 6 will get archived, + testTable.doWriteOperation("00000009", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits.size() - commitsAfterArchival.size(), 2); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003", "00000004", "00000005", "00000006")), + getActiveCommitInstants(Arrays.asList("00000007", "00000008", "00000009")), commitsAfterArchival); + + // and then 2nd compaction will take place at 12th commit + for (int i = 10; i < 13; i++) { + testTable.doWriteOperation("000000" + i, WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + } + + // one more commit will trigger compaction in metadata table and will let archival move forward. 
+ testTable.doWriteOperation("00000013", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + // before archival 5,6,7,8,9,10,11,12,13 + // after archival 12,13 + assertEquals(originalCommits.size() - commitsAfterArchival.size(), 5); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003", "00000004", "00000005", "00000006", "00000007", "00000008", + "00000009", "00000010", "00000011")), getActiveCommitInstants(Arrays.asList("00000012", "00000013")), commitsAfterArchival); + } + + private Pair, List> archiveAndGetCommitsList(HoodieWriteConfig writeConfig) throws IOException { + metaClient.reloadActiveTimeline(); + HoodieTimeline timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); + List originalCommits = timeline.getInstants().collect(Collectors.toList()); + HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(writeConfig, table); + archiveLog.archiveIfRequired(context); + timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); + List commitsAfterArchival = timeline.getInstants().collect(Collectors.toList()); + return Pair.of(originalCommits, commitsAfterArchival); + } + + private void verifyArchival(List expectedArchivedInstants, List expectedActiveInstants, List commitsAfterArchival) { + Collections.sort(expectedActiveInstants, Comparator.comparing(HoodieInstant::getTimestamp)); + Collections.sort(commitsAfterArchival, Comparator.comparing(HoodieInstant::getTimestamp)); + assertEquals(expectedActiveInstants, commitsAfterArchival); + expectedArchivedInstants.forEach(entry -> assertFalse(commitsAfterArchival.contains(entry))); + HoodieArchivedTimeline archivedTimeline = new HoodieArchivedTimeline(metaClient); + List actualArchivedInstants = archivedTimeline.getInstants().collect(Collectors.toList()); + Collections.sort(actualArchivedInstants, Comparator.comparing(HoodieInstant::getTimestamp)); + Collections.sort(expectedArchivedInstants, Comparator.comparing(HoodieInstant::getTimestamp)); + assertEquals(actualArchivedInstants, expectedArchivedInstants); + + HoodieTimeline timeline = metaClient.getActiveTimeline(); + expectedArchivedInstants.forEach(entry -> { + // check safety + if (entry.getAction() != HoodieTimeline.ROLLBACK_ACTION) { + assertTrue(timeline.containsOrBeforeTimelineStarts(entry.getTimestamp()), "Archived commits should always be safe"); + } + } + ); + } + + private List getArchivedInstants(HoodieInstant instant) { + List instants = new ArrayList<>(); + if (instant.getAction() == HoodieTimeline.COMMIT_ACTION || instant.getAction() == HoodieTimeline.DELTA_COMMIT_ACTION || instant.getAction() == HoodieTimeline.CLEAN_ACTION) { + instants.add(new HoodieInstant(State.REQUESTED, instant.getAction(), instant.getTimestamp())); + } + instants.add(new HoodieInstant(State.INFLIGHT, instant.getAction(), instant.getTimestamp())); + instants.add(new HoodieInstant(State.COMPLETED, instant.getAction(), instant.getTimestamp())); + return instants; + } + + private List getAllArchivedCommitInstants(List commitTimes) { + return getAllArchivedCommitInstants(commitTimes, HoodieTimeline.COMMIT_ACTION); + } + + private List getAllArchivedCommitInstants(List commitTimes, 
String action) { + List allInstants = new ArrayList<>(); + commitTimes.forEach(entry -> allInstants.addAll(getArchivedInstants(new HoodieInstant(State.COMPLETED, action, entry)))); + return allInstants; + } + + private List getActiveCommitInstants(List commitTimes) { + return getActiveCommitInstants(commitTimes, HoodieTimeline.COMMIT_ACTION); + } + + private List getActiveCommitInstants(List commitTimes, String action) { + List allInstants = new ArrayList<>(); + commitTimes.forEach(entry -> allInstants.add(new HoodieInstant(State.COMPLETED, action, entry))); + return allInstants; + } + private HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly) throws IOException { HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>()); @@ -665,14 +699,14 @@ private HoodieInstant createRollbackMetadata(String rollbackTime, String commitT HoodieTestTable.of(metaClient).addInflightRollback(rollbackTime); } else { HoodieRollbackMetadata hoodieRollbackMetadata = HoodieRollbackMetadata.newBuilder() - .setVersion(1) - .setStartRollbackTime(rollbackTime) - .setTotalFilesDeleted(1) - .setTimeTakenInMillis(1000) - .setCommitsRollback(Collections.singletonList(commitToRollback)) - .setPartitionMetadata(Collections.emptyMap()) - .setInstantsRollback(Collections.emptyList()) - .build(); + .setVersion(1) + .setStartRollbackTime(rollbackTime) + .setTotalFilesDeleted(1) + .setTimeTakenInMillis(1000) + .setCommitsRollback(Collections.singletonList(commitToRollback)) + .setPartitionMetadata(Collections.emptyMap()) + .setInstantsRollback(Collections.emptyList()) + .build(); HoodieTestTable.of(metaClient).addRollback(rollbackTime, hoodieRollbackMetadata); } return new HoodieInstant(inflight, "rollback", rollbackTime); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java index fafa3fad18da5..de55555434655 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java @@ -18,14 +18,14 @@ package org.apache.hudi.io.storage.row; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.testutils.HoodieClientTestHarness; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import org.apache.hudi.testutils.SparkDatasetTestUtils; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.spark.sql.Dataset; @@ -33,7 +33,8 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import java.util.List; import java.util.Random; @@ -62,12 +63,14 @@ public void tearDown() throws Exception { cleanupResources(); } - @Test - public void endToEndTest() throws 
Exception { - HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).build(); + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void endToEndTest(boolean parquetWriteLegacyFormatEnabled) throws Exception { + HoodieWriteConfig.Builder writeConfigBuilder = SparkDatasetTestUtils.getConfigBuilder(basePath); for (int i = 0; i < 5; i++) { // init write support and parquet config - HoodieRowParquetWriteSupport writeSupport = getWriteSupport(cfg, hadoopConf); + HoodieRowParquetWriteSupport writeSupport = getWriteSupport(writeConfigBuilder, hadoopConf, parquetWriteLegacyFormatEnabled); + HoodieWriteConfig cfg = writeConfigBuilder.build(); HoodieRowParquetConfig parquetConfig = new HoodieRowParquetConfig(writeSupport, CompressionCodecName.SNAPPY, cfg.getParquetBlockSize(), cfg.getParquetPageSize(), cfg.getParquetMaxFileSize(), writeSupport.getHadoopConf(), cfg.getParquetCompressionRatio()); @@ -101,12 +104,14 @@ public void endToEndTest() throws Exception { } } - private HoodieRowParquetWriteSupport getWriteSupport(HoodieWriteConfig writeConfig, Configuration hadoopConf) { + private HoodieRowParquetWriteSupport getWriteSupport(HoodieWriteConfig.Builder writeConfigBuilder, Configuration hadoopConf, boolean parquetWriteLegacyFormatEnabled) { + writeConfigBuilder.withStorageConfig(HoodieStorageConfig.newBuilder().parquetWriteLegacyFormat(String.valueOf(parquetWriteLegacyFormatEnabled)).build()); + HoodieWriteConfig writeConfig = writeConfigBuilder.build(); BloomFilter filter = BloomFilterFactory.createBloomFilter( writeConfig.getBloomFilterNumEntries(), writeConfig.getBloomFilterFPP(), writeConfig.getDynamicBloomFilterMaxNumEntries(), writeConfig.getBloomFilterType()); - return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, filter); + return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, filter, writeConfig); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java index 56f4eeef3402a..76a91ef124bb7 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java @@ -19,6 +19,7 @@ package org.apache.hudi.io.storage.row; import org.apache.hudi.client.HoodieInternalWriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; @@ -168,7 +169,8 @@ public void testGlobalFailure() throws Exception { @Test public void testInstantiationFailure() throws IOException { // init config and table - HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).withPath("/dummypath/abc/").build(); + HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .withPath("/dummypath/abc/").build(); HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); try { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java index 4bfc71fa353bc..4b590d9374c8e 
100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java @@ -33,6 +33,8 @@ import org.junit.jupiter.api.Test; import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; public class TestCustomKeyGenerator extends KeyGeneratorTestUtilities { @@ -122,6 +124,13 @@ private TypedProperties getPropertiesForNonPartitionedKeyGen(boolean useKeyGener return properties; } + private String stackTraceToString(Throwable e) { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + e.printStackTrace(pw); + return sw.toString(); + } + @Test public void testSimpleKeyGeneratorWithKeyGeneratorClass() throws IOException { testSimpleKeyGenerator(getPropertiesForSimpleKeyGen(true)); @@ -259,7 +268,7 @@ public void testNoRecordKeyFieldProp(boolean useKeyGeneratorClassName) { .getMessage() .contains("Property hoodie.datasource.write.recordkey.field not found")); } else { - Assertions.assertTrue(e.getMessage().contains("Property hoodie.datasource.write.recordkey.field not found")); + Assertions.assertTrue(stackTraceToString(e).contains("Property hoodie.datasource.write.recordkey.field not found")); } } @@ -282,7 +291,7 @@ public void testNoRecordKeyFieldProp(boolean useKeyGeneratorClassName) { .getMessage() .contains("Property hoodie.datasource.write.recordkey.field not found")); } else { - Assertions.assertTrue(e.getMessage().contains("Property hoodie.datasource.write.recordkey.field not found")); + Assertions.assertTrue(stackTraceToString(e).contains("Property hoodie.datasource.write.recordkey.field not found")); } } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java index 75dca2e8d0b4c..0fc90c83a08d4 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java @@ -34,7 +34,7 @@ import java.util.stream.Stream; -import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH; +import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH; public class TestSimpleKeyGenerator extends KeyGeneratorTestUtilities { private TypedProperties getCommonProps() { @@ -108,9 +108,9 @@ public void testWrongRecordKeyField() { public void testWrongPartitionPathField() { SimpleKeyGenerator keyGenerator = new SimpleKeyGenerator(getWrongPartitionPathFieldProps()); GenericRecord record = getRecord(); - Assertions.assertEquals(keyGenerator.getPartitionPath(record), KeyGenUtils.DEFAULT_PARTITION_PATH); + Assertions.assertEquals(keyGenerator.getPartitionPath(record), KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH); Assertions.assertEquals(keyGenerator.getPartitionPath(KeyGeneratorTestUtilities.getRow(record)), - KeyGenUtils.DEFAULT_PARTITION_PATH); + KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH); } @Test @@ -151,7 +151,7 @@ public void testNestedPartitionPathField(GenericRecord nestedColRecord) { partitionPathFieldValue = (String) nestedColRecord.get("prop1"); } String expectedPartitionPath = "nested_col.prop1=" - + (partitionPathFieldValue != null && !partitionPathFieldValue.isEmpty() ? partitionPathFieldValue : DEFAULT_PARTITION_PATH); + + (partitionPathFieldValue != null && !partitionPathFieldValue.isEmpty() ? 
partitionPathFieldValue : HUDI_DEFAULT_PARTITION_PATH); HoodieKey key = keyGenerator.getKey(record); Assertions.assertEquals("key1", key.getRecordKey()); Assertions.assertEquals(expectedPartitionPath, key.getPartitionPath()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java index 87dd26f0c6428..cb468e903e59b 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java @@ -63,6 +63,7 @@ import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator; import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler; import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.CleanerUtils; @@ -75,7 +76,9 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.action.clean.CleanPlanner; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.log4j.LogManager; @@ -166,8 +169,8 @@ private Pair> insertFirstBigBatchForClientCleanerTe assertTrue(table.getCompletedCleanTimeline().empty()); if (client.getConfig().shouldAutoCommit()) { - HoodieIndex index = SparkHoodieIndex.createIndex(cfg); - List taggedRecords = ((JavaRDD) index.tagLocation(jsc.parallelize(records, 1), context, table)).collect(); + HoodieIndex index = SparkHoodieIndexFactory.createIndex(cfg); + List taggedRecords = tagLocation(index, jsc.parallelize(records, 1), table).collect(); checkTaggedRecords(taggedRecords, newCommitTime); } return Pair.of(newCommitTime, statuses); @@ -266,6 +269,7 @@ private void testInsertAndCleanByVersions( .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build()) .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { @@ -298,7 +302,7 @@ private void testInsertAndCleanByVersions( .map(e -> Pair.of(e.getKey().getPartitionPath(), e.getValue())).collect(Collectors.toList()); HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicePairs, Option.empty(), Option.empty()); - List instantTimes = makeIncrementalCommitTimes(9); + List instantTimes = makeIncrementalCommitTimes(9, 1, 10); String compactionTime = instantTimes.get(0); table.getActiveTimeline().saveToCompactionRequested( new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime), @@ -435,6 +439,7 @@ private void testInsertAndCleanByCommits( .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()) .withParallelism(1, 
1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); SparkRDDWriteClient client = getHoodieWriteClient(cfg); @@ -511,6 +516,7 @@ private void testFailedInsertAndCleanByCommits( .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()) .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); SparkRDDWriteClient client = getHoodieWriteClient(cfg); @@ -632,29 +638,38 @@ private List runCleaner(HoodieWriteConfig config, boolean simul } /** - * Test HoodieTable.clean() Cleaning by versions logic. + * Test Hudi COW Table Cleaner - Keep the latest file versions policy. */ @ParameterizedTest @ValueSource(booleans = {false, true}) public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean) .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) .build(); - HoodieTestTable testTable = HoodieTestTable.of(metaClient); - String p0 = "2020/01/01"; - String p1 = "2020/01/02"; - Map> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null; + + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + + final String p0 = "2020/01/01"; + final String p1 = "2020/01/02"; + final Map> bootstrapMapping = enableBootstrapSourceClean + ? generateBootstrapIndexAndSourceData(p0, p1) : null; // make 1 commit, with 1 file per partition - String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() + final String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() : UUID.randomUUID().toString(); - String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId() + final String file1P1C0 = enableBootstrapSourceClean ? 
bootstrapMapping.get(p1).get(0).getFileId() : UUID.randomUUID().toString(); - testTable.addCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + + Map>> c1PartitionToFilesNameLengthMap = new HashMap<>(); + c1PartitionToFilesNameLengthMap.put(p0, Collections.singletonList(Pair.of(file1P0C0, 100))); + c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, 200))); + testTable.doWriteOperation("00000000000001", WriteOperationType.INSERT, Arrays.asList(p0, p1), + c1PartitionToFilesNameLengthMap, false, false); List hoodieCleanStatsOne = runCleaner(config); assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files"); @@ -662,17 +677,21 @@ public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throw assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); // make next commit, with 1 insert & 1 update per partition - Map partitionAndFileId002 = testTable.addCommit("00000000000002") - .withBaseFilesInPartition(p0, file1P0C0) - .withBaseFilesInPartition(p1, file1P1C0) - .getFileIdsWithBaseFilesInPartitions(p0, p1); - + final String file2P0C1 = UUID.randomUUID().toString(); + final String file2P1C1 = UUID.randomUUID().toString(); + Map>> c2PartitionToFilesNameLengthMap = new HashMap<>(); + c2PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 101), Pair.of(file2P0C1, 100))); + c2PartitionToFilesNameLengthMap.put(p1, Arrays.asList(Pair.of(file1P1C0, 201), Pair.of(file2P1C1, 200))); + testTable.doWriteOperation("00000000000002", WriteOperationType.UPSERT, Collections.emptyList(), + c2PartitionToFilesNameLengthMap, false, false); + + // enableBootstrapSourceClean would delete the bootstrap base file at the same time List hoodieCleanStatsTwo = runCleaner(config, 1); - // enableBootstrapSourceClean would delete the bootstrap base file as the same time HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0); assertEquals(enableBootstrapSourceClean ? 2 : 1, cleanStat.getSuccessDeleteFiles().size() + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file"); + if (enableBootstrapSourceClean) { HoodieFileStatus fstatus = bootstrapMapping.get(p0).get(0).getBootstrapFileStatus(); @@ -683,9 +702,8 @@ public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throw assertFalse(Files.exists(Paths.get(bootstrapMapping.get( p0).get(0).getBootstrapFileStatus().getPath().getUri()))); } + cleanStat = getCleanStat(hoodieCleanStatsTwo, p1); - String file2P0C1 = partitionAndFileId002.get(p0); - String file2P1C1 = partitionAndFileId002.get(p1); assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1)); assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); @@ -693,6 +711,7 @@ public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throw assertEquals(enableBootstrapSourceClean ? 2 : 1, cleanStat.getSuccessDeleteFiles().size() + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 
0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file"); + if (enableBootstrapSourceClean) { HoodieFileStatus fstatus = bootstrapMapping.get(p1).get(0).getBootstrapFileStatus(); @@ -705,9 +724,13 @@ public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throw } // make next commit, with 2 updates to existing files, and 1 insert - String file3P0C2 = testTable.addCommit("00000000000003") - .withBaseFilesInPartition(p0, file1P0C0, file2P0C1) - .getFileIdsWithBaseFilesInPartitions(p0).get(p0); + final String file3P0C2 = UUID.randomUUID().toString(); + Map>> c3PartitionToFilesNameLengthMap = new HashMap<>(); + c3PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 102), Pair.of(file2P0C1, 101), + Pair.of(file3P0C2, 100))); + testTable.doWriteOperation("00000000000003", WriteOperationType.UPSERT, Collections.emptyList(), + c3PartitionToFilesNameLengthMap, false, false); + List hoodieCleanStatsThree = runCleaner(config, 3); assertEquals(2, getCleanStat(hoodieCleanStatsThree, p0) @@ -718,6 +741,7 @@ public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throw // No cleaning on partially written file, with no commit. testTable.forCommit("00000000000004").withBaseFilesInPartition(p0, file3P0C2); + List hoodieCleanStatsFour = runCleaner(config); assertEquals(0, hoodieCleanStatsFour.size(), "Must not clean any files"); assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2)); @@ -731,7 +755,7 @@ public void testKeepLatestFileVersionsMOR() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) .build(); @@ -769,7 +793,7 @@ public void testKeepLatestCommitsMOR() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build()) .build(); @@ -809,7 +833,7 @@ public void testKeepLatestCommitsMOR() throws Exception { @Test public void testCleanWithReplaceCommits() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) .build(); @@ -893,7 +917,7 @@ public void testCleanWithReplaceCommits() throws Exception { String file4P1C4 = partitionAndFileId005.get(p1); replaceMetadata = generateReplaceCommitMetadata(p0, file3P1C2, file4P1C4); testTable.addReplaceCommit("00000000000005", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); - + List hoodieCleanStatsFive = runCleaner(config, 2); 
assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3)); assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); @@ -901,7 +925,7 @@ public void testCleanWithReplaceCommits() throws Exception { assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); } - + private Pair generateReplaceCommitMetadata(String partition, String replacedFileId, String newFileId) { @@ -918,7 +942,7 @@ private Pair genera .setVersion(1).setExtraMetadata(Collections.emptyMap()) .setStrategy(HoodieClusteringStrategy.newBuilder().setStrategyClassName("").setVersion(1).build()) .setInputGroups(clusteringGroups).build()); - + HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata(); replaceMetadata.addReplaceFileId(partition, replacedFileId); replaceMetadata.setOperationType(WriteOperationType.CLUSTER); @@ -1116,7 +1140,7 @@ private static Stream argumentsForTestKeepLatestCommits() { @MethodSource("argumentsForTestKeepLatestCommits") public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withIncrementalCleaningMode(enableIncrementalClean) .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) @@ -1252,7 +1276,7 @@ private Map> generateBootstrapIndexAndSourceD assertTrue(new File(sourcePath.toString()).exists()); // recreate metaClient with Bootstrap base path - metaClient = HoodieTestUtils.init(basePath, getTableType(), sourcePath.toString()); + metaClient = HoodieTestUtils.init(basePath, getTableType(), sourcePath.toString(), true); // generate bootstrap index Map> bootstrapMapping = TestBootstrapIndex.generateBootstrapIndex(metaClient, sourcePath.toString(), @@ -1282,7 +1306,9 @@ public void testCleanMarkerDataFilesOnRollback() throws Exception { table.getActiveTimeline().transitionRequestedToInflight( new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "000"), Option.empty()); metaClient.reloadActiveTimeline(); - table.rollback(context, "001", new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "000"), true); + HoodieInstant rollbackInstant = new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "000"); + table.scheduleRollback(context, "001", rollbackInstant, false); + table.rollback(context, "001", rollbackInstant, true, false); final int numTempFilesAfter = testTable.listAllFilesInTempFolder().length; assertEquals(0, numTempFilesAfter, "All temp files are deleted."); } @@ -1293,7 +1319,7 @@ public void testCleanMarkerDataFilesOnRollback() throws Exception { @Test public void testCleaningWithZeroPartitionPaths() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) .build(); @@ -1301,7 +1327,9 @@ public 
void testCleaningWithZeroPartitionPaths() throws Exception { // Make a commit, although there are no partitionPaths. // Example use-case of this is when a client wants to create a table // with just some commit metadata, but no data/partitionPaths. - HoodieTestTable.of(metaClient).addCommit("000"); + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + testTable.doWriteOperation("001", WriteOperationType.INSERT, Collections.emptyList(), 1); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -1315,7 +1343,7 @@ public void testCleaningWithZeroPartitionPaths() throws Exception { @Test public void testKeepLatestCommitsWithPendingCompactions() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) .build(); @@ -1339,7 +1367,7 @@ public void testKeepLatestCommitsWithPendingCompactions() throws Exception { public void testKeepLatestVersionsWithPendingCompactions(boolean retryFailure) throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()) .build(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java index f0046afe03a70..595d4df2a93a5 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -193,7 +194,7 @@ public void testUpsertPartitioner(boolean populateMetaFields) throws Exception { @ValueSource(booleans = {true, false}) public void testLogFileCountsAfterCompaction(boolean populateMetaFields) throws Exception { // insert 100 records - HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true); + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()); addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); HoodieWriteConfig config = cfgBuilder.build(); @@ -524,6 +525,7 @@ public void testHandleUpdateWithMultiplePartitions() throws Exception { JavaRDD deleteRDD = jsc().parallelize(fewRecordsForDelete, 1); // initialize partitioner + hoodieTable.getHoodieView().sync(); 
AbstractSparkDeltaCommitActionExecutor actionExecutor = new SparkDeleteDeltaCommitActionExecutor(context(), cfg, hoodieTable, newDeleteTime, deleteRDD); actionExecutor.getUpsertPartitioner(new WorkloadProfile(buildProfile(deleteRDD))); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java index d9dc6ac978d92..8617c848729c2 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java @@ -23,7 +23,8 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.index.bloom.SparkHoodieBloomIndex; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.index.bloom.HoodieBloomIndex; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; @@ -64,13 +65,20 @@ private enum CombineTestMode { private static final boolean WITHOUT_COMBINE = false; private static final int DELETE_PARALLELISM = 200; - @Mock private SparkHoodieBloomIndex index; - @Mock private HoodieTable, JavaRDD, JavaRDD> table; - @Mock private BaseSparkCommitActionExecutor executor; - @Mock private HoodieWriteMetadata metadata; - @Mock private JavaPairRDD keyPairs; - @Mock private JavaSparkContext jsc; - @Mock private HoodieSparkEngineContext context; + @Mock + private HoodieBloomIndex index; + @Mock + private HoodieTable, JavaRDD, JavaRDD> table; + @Mock + private BaseSparkCommitActionExecutor executor; + @Mock + private HoodieWriteMetadata metadata; + @Mock + private JavaPairRDD keyPairs; + @Mock + private JavaSparkContext jsc; + @Mock + private HoodieSparkEngineContext context; private JavaRDD rddToDelete; private HoodieWriteConfig config; @@ -149,7 +157,7 @@ private JavaRDD newHoodieKeysRddMock(int howMany, CombineTestMode com JavaRDD recordsRdd = mock(JavaRDD.class); when(recordsRdd.filter(any())).thenReturn(recordsRdd); when(recordsRdd.isEmpty()).thenReturn(howMany <= 0); - when(index.tagLocation(any(), any(), any())).thenReturn(recordsRdd); + when(index.tagLocation(any(), any(), any())).thenReturn(HoodieJavaRDD.of(recordsRdd)); if (combineMode == CombineTestMode.GlobalIndex) { when(keyPairs.reduceByKey(any(), anyInt())).thenReturn(keyPairs); @@ -175,7 +183,7 @@ private JavaRDD mockEmptyHoodieKeyRdd() { doReturn(Collections.emptyList()).when(emptyRdd).partitions(); doReturn(emptyRdd).when(emptyRdd).map(any()); - doReturn(emptyRdd).when(index).tagLocation(any(), any(), any()); + doReturn(HoodieJavaRDD.of(emptyRdd)).when(index).tagLocation(any(), any(), any()); doReturn(emptyRdd).when(emptyRdd).filter(any()); doNothing().when(executor).saveWorkloadProfileMetadataToInflight(any(), anyString()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java index 3a125d2302c19..7b5cc27d37280 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java @@ -21,6 +21,7 @@ import 
org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -217,7 +218,7 @@ public void testPartitionWeight() throws Exception { final String testPartitionPath = "2016/09/26"; int totalInsertNum = 2000; - HoodieWriteConfig config = makeHoodieClientConfigBuilder() + HoodieWriteConfig config = makeHoodieClientConfigBuilder().withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(0) .insertSplitSize(totalInsertNum / 2).autoTuneInsertSplits(false).build()).build(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java index 4c9ab3dc9d11d..c3f4395b5a81a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java @@ -46,6 +46,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.hudi.testutils.HoodieClientTestUtils; @@ -175,6 +176,7 @@ protected void executeCompaction(String compactionInstantTime, SparkRDDWriteClie HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException { client.compact(compactionInstantTime); + assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), table, compactionInstantTime).doesMarkerDirExist()); List fileSliceList = getCurrentLatestFileSlices(table); assertTrue(fileSliceList.stream().findAny().isPresent(), "Ensure latest file-slices are not empty"); assertFalse(fileSliceList.stream() diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java index 79c415a4bc268..c2879fb1aaf4c 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java @@ -52,7 +52,7 @@ public class TestAsyncCompaction extends CompactionTestBase { private HoodieWriteConfig getConfig(Boolean autoCommit) { return getConfigBuilder(autoCommit) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).validate(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); } @@ -89,8 +89,8 @@ public void testRollbackForInflightCompaction() throws Exception { metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - client.rollbackInflightCompaction( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionInstantTime), hoodieTable); + 
hoodieTable.rollbackInflightCompaction( + new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionInstantTime)); metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); pendingCompactionInstant = metaClient.getCommitsAndCompactionTimeline().filterPendingCompactionTimeline() .getInstants().findFirst().get(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java index 14e26b9d4dcbf..6b837e3178ee0 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java @@ -21,6 +21,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieRecord; @@ -40,7 +41,8 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.bloom.SparkHoodieBloomIndex; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; @@ -159,7 +161,10 @@ public void testScheduleCompactionWithInflightInstant() { @Test public void testWriteStatusContentsAfterCompaction() throws Exception { // insert 100 records - HoodieWriteConfig config = getConfig(); + HoodieWriteConfig config = getConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .build(); try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) { String newCommitTime = "100"; writeClient.startCommitWithTime(newCommitTime); @@ -175,8 +180,8 @@ public void testWriteStatusContentsAfterCompaction() throws Exception { List updatedRecords = dataGen.generateUpdates(newCommitTime, records); JavaRDD updatedRecordsRDD = jsc.parallelize(updatedRecords, 1); - HoodieIndex index = new SparkHoodieBloomIndex<>(config); - updatedRecords = ((JavaRDD)index.tagLocation(updatedRecordsRDD, context, table)).collect(); + HoodieIndex index = new HoodieBloomIndex<>(config, SparkHoodieBloomIndexHelper.getInstance()); + updatedRecords = tagLocation(index, updatedRecordsRDD, table).collect(); // Write them to corresponding avro logfiles. Also, set the state transition properly. 
HoodieSparkWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS) @@ -204,7 +209,8 @@ public void testWriteStatusContentsAfterCompaction() throws Exception { String compactionInstantTime = "102"; table.scheduleCompaction(context, compactionInstantTime, Option.empty()); table.getMetaClient().reloadActiveTimeline(); - JavaRDD result = (JavaRDD) table.compact(context, compactionInstantTime).getWriteStatuses(); + JavaRDD result = (JavaRDD) table.compact( + context, compactionInstantTime).getWriteStatuses(); // Verify that all partition paths are present in the WriteStatus result for (String partitionPath : dataGen.getPartitionPaths()) { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java index 823d651aa1589..ef52953a2f0c8 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java @@ -26,6 +26,8 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.marker.WriteMarkersFactory; import org.junit.jupiter.api.Test; import java.util.ArrayList; @@ -35,6 +37,7 @@ import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; public class TestInlineCompaction extends CompactionTestBase { @@ -84,6 +87,8 @@ public void testSuccessfulCompactionBasedOnNumCommits() throws Exception { metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); assertEquals(4, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); assertEquals(HoodieTimeline.COMMIT_ACTION, metaClient.getActiveTimeline().lastInstant().get().getAction()); + String compactionTime = metaClient.getActiveTimeline().lastInstant().get().getTimestamp(); + assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), HoodieSparkTable.create(cfg, context), compactionTime).doesMarkerDirExist()); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java index bc1f3c3885028..3225dcd04ea3f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.action.rollback; import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.common.HoodieRollbackStat; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieFileGroup; @@ -79,13 +80,17 @@ public void testCopyOnWriteRollbackActionExecutorForFileListingAsGenerateFile() .withBaseFilesInPartition(p1, "id21") .withBaseFilesInPartition(p2, "id22"); - HoodieTable table = this.getHoodieTable(metaClient, getConfig()); + HoodieWriteConfig 
writeConfig = getConfigBuilder().withRollbackUsingMarkers(false).build(); + HoodieTable table = this.getHoodieTable(metaClient, writeConfig); HoodieInstant needRollBackInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "002"); // execute CopyOnWriteRollbackActionExecutor with filelisting mode - SparkCopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new SparkCopyOnWriteRollbackActionExecutor(context, table.getConfig(), table, "003", needRollBackInstant, true); - assertFalse(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - List hoodieRollbackStats = copyOnWriteRollbackActionExecutor.executeRollback(); + BaseRollbackPlanActionExecutor copyOnWriteRollbackPlanActionExecutor = + new BaseRollbackPlanActionExecutor(context, table.getConfig(), table, "003", needRollBackInstant, false); + HoodieRollbackPlan rollbackPlan = (HoodieRollbackPlan) copyOnWriteRollbackPlanActionExecutor.execute().get(); + CopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new CopyOnWriteRollbackActionExecutor(context, table.getConfig(), table, "003", needRollBackInstant, true, + false); + List hoodieRollbackStats = copyOnWriteRollbackActionExecutor.executeRollback(rollbackPlan); // assert hoodieRollbackStats assertEquals(hoodieRollbackStats.size(), 3); @@ -96,14 +101,14 @@ public void testCopyOnWriteRollbackActionExecutorForFileListingAsGenerateFile() assertEquals(0, stat.getFailedDeleteFiles().size()); assertEquals(Collections.EMPTY_MAP, stat.getCommandBlocksCount()); assertEquals(testTable.forCommit("002").getBaseFilePath(p1, "id21").toString(), - stat.getSuccessDeleteFiles().get(0)); + this.fs.getScheme() + ":" + stat.getSuccessDeleteFiles().get(0)); break; case p2: assertEquals(1, stat.getSuccessDeleteFiles().size()); assertEquals(0, stat.getFailedDeleteFiles().size()); assertEquals(Collections.EMPTY_MAP, stat.getCommandBlocksCount()); assertEquals(testTable.forCommit("002").getBaseFilePath(p2, "id22").toString(), - stat.getSuccessDeleteFiles().get(0)); + this.fs.getScheme() + ":" + stat.getSuccessDeleteFiles().get(0)); break; case p3: assertEquals(0, stat.getSuccessDeleteFiles().size()); @@ -150,7 +155,7 @@ public void testCopyOnWriteRollbackActionExecutor(boolean isUsingMarkers) throws HoodieTable table = this.getHoodieTable(metaClient, cfg); performRollbackAndValidate(isUsingMarkers, cfg, table, firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices); } - + private void performRollbackAndValidate(boolean isUsingMarkers, HoodieWriteConfig cfg, HoodieTable table, List firstPartitionCommit2FileSlices, List secondPartitionCommit2FileSlices) throws IOException { @@ -162,12 +167,11 @@ private void performRollbackAndValidate(boolean isUsingMarkers, HoodieWriteConfi commitInstant = table.getCompletedCommitTimeline().lastInstant().get(); } - SparkCopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new SparkCopyOnWriteRollbackActionExecutor(context, cfg, table, "003", commitInstant, false); - if (!isUsingMarkers) { - assertFalse(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - } else { - assertTrue(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - } + BaseRollbackPlanActionExecutor copyOnWriteRollbackPlanActionExecutor = + new BaseRollbackPlanActionExecutor(context, table.getConfig(), table, "003", commitInstant, false); + HoodieRollbackPlan hoodieRollbackPlan = (HoodieRollbackPlan) 
copyOnWriteRollbackPlanActionExecutor.execute().get(); + CopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new CopyOnWriteRollbackActionExecutor(context, cfg, table, "003", commitInstant, false, + false); Map rollbackMetadata = copyOnWriteRollbackActionExecutor.execute().getPartitionMetadata(); //3. assert the rollback stat @@ -175,9 +179,9 @@ private void performRollbackAndValidate(boolean isUsingMarkers, HoodieWriteConfi for (Map.Entry entry : rollbackMetadata.entrySet()) { HoodieRollbackPartitionMetadata meta = entry.getValue(); assertTrue(meta.getFailedDeleteFiles() == null - || meta.getFailedDeleteFiles().size() == 0); + || meta.getFailedDeleteFiles().size() == 0); assertTrue(meta.getSuccessDeleteFiles() == null - || meta.getSuccessDeleteFiles().size() == 1); + || meta.getSuccessDeleteFiles().size() == 1); } //4. assert filegroup after rollback, and compare to the rollbackstat @@ -187,15 +191,11 @@ private void performRollbackAndValidate(boolean isUsingMarkers, HoodieWriteConfi List firstPartitionRollBack1FileSlices = firstPartitionRollBack1FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()); assertEquals(1, firstPartitionRollBack1FileSlices.size()); - if (!isUsingMarkers) { - firstPartitionCommit2FileSlices.removeAll(firstPartitionRollBack1FileSlices); - assertEquals(1, firstPartitionCommit2FileSlices.size()); - assertEquals(firstPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), - rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles().get(0)); - } else { - assertEquals(firstPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), - String.format("%s:%s/%s", this.fs.getScheme(), basePath, rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles().get(0))); - } + firstPartitionCommit2FileSlices.removeAll(firstPartitionRollBack1FileSlices); + assertEquals(1, firstPartitionCommit2FileSlices.size()); + assertEquals(firstPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), + this.fs.getScheme() + ":" + rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles().get(0)); + // assert the second partition file group and file slice List secondPartitionRollBack1FileGroups = table.getFileSystemView().getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList()); @@ -204,15 +204,10 @@ private void performRollbackAndValidate(boolean isUsingMarkers, HoodieWriteConfi assertEquals(1, secondPartitionRollBack1FileSlices.size()); // assert the second partition rollback file is equals rollBack1SecondPartitionStat - if (!isUsingMarkers) { - secondPartitionCommit2FileSlices.removeAll(secondPartitionRollBack1FileSlices); - assertEquals(1, secondPartitionCommit2FileSlices.size()); - assertEquals(secondPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), - rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles().get(0)); - } else { - assertEquals(secondPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), - String.format("%s:%s/%s", this.fs.getScheme(), basePath, rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles().get(0))); - } + secondPartitionCommit2FileSlices.removeAll(secondPartitionRollBack1FileSlices); + assertEquals(1, secondPartitionCommit2FileSlices.size()); + assertEquals(secondPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), + this.fs.getScheme() + ":" + rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles().get(0)); 
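
The rollback tests above now split a rollback into two phases: a planning executor that materializes a HoodieRollbackPlan, and an execution executor that consumes it. A minimal sketch of that flow follows, written as if it lived in the same org.apache.hudi.table.action.rollback package as these tests (some of the executor methods may not be public outside it); the constructor arguments, boolean flags, and raw generic types are copied from the test code in this diff, and anything not visible there is an assumption rather than the definitive API.

package org.apache.hudi.table.action.rollback;

import java.util.List;

import org.apache.hudi.avro.model.HoodieRollbackPlan;
import org.apache.hudi.common.HoodieRollbackStat;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;

public class RollbackPlanThenExecuteSketch {

  // Rolls back a commit in the two phases used by the tests above:
  // first materialize a HoodieRollbackPlan, then hand it to the rollback executor.
  @SuppressWarnings({"rawtypes", "unchecked"})
  static List<HoodieRollbackStat> rollback(HoodieEngineContext context, HoodieTable table,
      HoodieWriteConfig config, String rollbackInstantTime, String commitToRollback) {
    HoodieInstant instantToRollback =
        new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitToRollback);

    // Phase 1: plan the rollback with a file-listing strategy (the trailing
    // "false" disables marker-based rollback, as in the test).
    BaseRollbackPlanActionExecutor planExecutor = new BaseRollbackPlanActionExecutor(
        context, config, table, rollbackInstantTime, instantToRollback, false);
    HoodieRollbackPlan rollbackPlan = (HoodieRollbackPlan) planExecutor.execute().get();

    // Phase 2: execute the plan; the two booleans mirror the flags passed in the test.
    CopyOnWriteRollbackActionExecutor rollbackExecutor = new CopyOnWriteRollbackActionExecutor(
        context, config, table, rollbackInstantTime, instantToRollback, true, false);
    return rollbackExecutor.executeRollback(rollbackPlan);
  }
}
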
assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), table, commitInstant.getTimestamp()).doesMarkerDirExist()); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java index 75e6a7ac0b703..38be873e57ad6 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieLogFile; @@ -61,7 +62,7 @@ public void setUp() throws Exception { initPath(); initSparkContexts(); //just generate tow partitions - dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); + dataGen = new HoodieTestDataGenerator(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); initFileSystem(); initMetaClient(); } @@ -89,20 +90,17 @@ public void testMergeOnReadRollbackActionExecutor(boolean isUsingMarkers) throws //2. rollback HoodieInstant rollBackInstant = new HoodieInstant(isUsingMarkers, HoodieTimeline.DELTA_COMMIT_ACTION, "002"); - SparkMergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new SparkMergeOnReadRollbackActionExecutor( + BaseRollbackPlanActionExecutor mergeOnReadRollbackPlanActionExecutor = + new BaseRollbackPlanActionExecutor(context, cfg, table, "003", rollBackInstant, false); + mergeOnReadRollbackPlanActionExecutor.execute().get(); + MergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new MergeOnReadRollbackActionExecutor( context, cfg, table, "003", rollBackInstant, - true); - // assert is filelist mode - if (!isUsingMarkers) { - assertFalse(mergeOnReadRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - } else { - assertTrue(mergeOnReadRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - } - + true, + false); //3. 
assert the rollback stat Map rollbackMetadata = mergeOnReadRollbackActionExecutor.execute().getPartitionMetadata(); assertEquals(2, rollbackMetadata.size()); @@ -145,15 +143,14 @@ public void testMergeOnReadRollbackActionExecutor(boolean isUsingMarkers) throws public void testFailForCompletedInstants() { Assertions.assertThrows(IllegalArgumentException.class, () -> { HoodieInstant rollBackInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "002"); - new SparkMergeOnReadRollbackActionExecutor( - context, - getConfigBuilder().build(), - getHoodieTable(metaClient, getConfigBuilder().build()), - "003", - rollBackInstant, - true, - true, - true); + new MergeOnReadRollbackActionExecutor(context, getConfigBuilder().build(), + getHoodieTable(metaClient, getConfigBuilder().build()), + "003", + rollBackInstant, + true, + true, + true, + false).execute(); }); } @@ -163,7 +160,7 @@ public void testFailForCompletedInstants() { @Test public void testRollbackWhenFirstCommitFail() throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()).build(); try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { client.startCommitWithTime("001"); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java index 03dd3b055f342..a22a04075ea80 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java @@ -131,6 +131,7 @@ void testClustering(boolean doUpdates, boolean populateMetaFields, boolean prese } HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + hoodieTable.getHoodieView().sync(); FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); // expect 2 base files for each partition assertEquals(dataGen.getPartitionPaths().length * 2, allFiles.length); @@ -146,6 +147,7 @@ void testClustering(boolean doUpdates, boolean populateMetaFields, boolean prese metaClient = HoodieTableMetaClient.reload(metaClient); final HoodieTable clusteredTable = HoodieSparkTable.create(cfg, context(), metaClient); + clusteredTable.getHoodieView().sync(); Stream dataFilesToRead = Arrays.stream(dataGen.getPartitionPaths()) .flatMap(p -> clusteredTable.getBaseFileOnlyView().getLatestBaseFiles(p)); // verify there should be only one base file per partition after clustering. 
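
Several tests in this patch now call getHoodieView().sync() on a freshly created HoodieSparkTable before listing files, so the view picks up commits written after the table object (or its metadata-backed view) was built. A minimal sketch of that listing pattern; the generic types, which the diff elides, are assumed here (getLatestBaseFiles returning a stream of HoodieBaseFile), and the method is illustrative rather than part of the patch.

import java.util.stream.Stream;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;

public class SyncedViewListingSketch {

  // Reload the meta client, rebuild the table, and sync its file-system view before
  // asking for the latest base files of a partition -- the pattern used after
  // compaction/clustering in the tests above.
  static Stream<HoodieBaseFile> latestBaseFiles(HoodieWriteConfig cfg, HoodieSparkEngineContext context,
      HoodieTableMetaClient metaClient, String partitionPath) {
    HoodieTableMetaClient reloaded = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieSparkTable.create(cfg, context, reloaded);
    // Without the sync, a view built before the latest commits can serve stale listings.
    table.getHoodieView().sync();
    return table.getBaseFileOnlyView().getLatestBaseFiles(partitionPath);
  }
}
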
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java index e7e7074094250..fd2f63a26c638 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java @@ -20,6 +20,7 @@ package org.apache.hudi.table.functional; import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -83,7 +84,7 @@ public void testIncrementalReadsWithCompaction() throws Exception { Properties props = new Properties(); props.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieFileFormat.PARQUET.toString()); HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, props); - HoodieWriteConfig cfg = getConfig(true); + HoodieWriteConfig cfg = getConfigBuilder(true).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()).build(); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { /* diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java index caecbef52031a..fb44c14f59ad2 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java @@ -109,6 +109,7 @@ public void testSimpleInsertAndUpdate(HoodieFileFormat fileFormat, boolean popul client.compact(compactionCommitTime); HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + hoodieTable.getHoodieView().sync(); FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); Stream dataFilesToRead = tableView.getLatestBaseFiles(); @@ -238,6 +239,7 @@ public void testSimpleInsertsGeneratedIntoLogFiles() throws Exception { writeClient.commit(newCommitTime, statuses); HoodieTable table = HoodieSparkTable.create(config, context(), metaClient); + table.getHoodieView().sync(); TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView(); long numLogFiles = 0; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java index 7ab5a80e5f446..6bbb0f655bb8e 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.SparkRDDWriteClient; import 
org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileGroup; @@ -51,9 +52,8 @@ import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; import java.io.File; @@ -136,23 +136,14 @@ void testCOWToMORConvertedTableRollback(boolean rollbackUsingMarkers) throws Exc } } - private static Stream testRollbackWithDeltaAndCompactionCommit() { - return Stream.of( - Arguments.of(true, true), - Arguments.of(true, false), - Arguments.of(false, true), - Arguments.of(false, false) - ); - } - @ParameterizedTest - @MethodSource - void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers, boolean populateMetaFields) throws Exception { + @ValueSource(booleans = {true, false}) + void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) throws Exception { HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.SIMPLE); - addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + addConfigsForPopulateMetaFields(cfgBuilder, true); HoodieWriteConfig cfg = cfgBuilder.build(); - Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen(); + Properties properties = new Properties(); properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); @@ -288,8 +279,8 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers, bool final String compactedCommitTime = metaClient.getActiveTimeline().reload().lastInstant().get().getTimestamp(); assertTrue(Arrays.stream(listAllBaseFilesInPath(hoodieTable)) .anyMatch(file -> compactedCommitTime.equals(new HoodieBaseFile(file).getCommitTime()))); - thirdClient.rollbackInflightCompaction(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactedCommitTime), - hoodieTable); + hoodieTable.rollbackInflightCompaction(new HoodieInstant( + HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactedCommitTime)); allFiles = listAllBaseFilesInPath(hoodieTable); metaClient = HoodieTableMetaClient.reload(metaClient); tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); @@ -300,10 +291,10 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers, bool } } - @ParameterizedTest - @ValueSource(booleans = {true, false}) - void testMultiRollbackWithDeltaAndCompactionCommit(boolean populateMetaFields) throws Exception { - HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false); + @Test + void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { + boolean populateMetaFields = true; + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()); addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); HoodieWriteConfig cfg = cfgBuilder.build(); @@ -352,7 +343,9 @@ void 
testMultiRollbackWithDeltaAndCompactionCommit(boolean populateMetaFields) t */ newCommitTime = "002"; // WriteClient with custom config (disable small file handling) - try (SparkRDDWriteClient nClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(populateMetaFields))) { + HoodieWriteConfig smallFileWriteConfig = getHoodieWriteConfigWithSmallFileHandlingOffBuilder(populateMetaFields) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()).build(); + try (SparkRDDWriteClient nClient = getHoodieWriteClient(smallFileWriteConfig)) { nClient.startCommitWithTime(newCommitTime); List copyOfRecords = new ArrayList<>(records); @@ -463,6 +456,10 @@ void testMultiRollbackWithDeltaAndCompactionCommit(boolean populateMetaFields) t } private HoodieWriteConfig getHoodieWriteConfigWithSmallFileHandlingOff(boolean populateMetaFields) { + return getHoodieWriteConfigWithSmallFileHandlingOffBuilder(populateMetaFields).build(); + } + + private HoodieWriteConfig.Builder getHoodieWriteConfigWithSmallFileHandlingOffBuilder(boolean populateMetaFields) { HoodieWriteConfig.Builder cfgBuilder = HoodieWriteConfig.newBuilder().withPath(basePath()).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withDeleteParallelism(2) .withAutoCommit(false) @@ -474,7 +471,7 @@ private HoodieWriteConfig getHoodieWriteConfigWithSmallFileHandlingOff(boolean p if (!populateMetaFields) { addConfigsForPopulateMetaFields(cfgBuilder, false); } - return cfgBuilder.build(); + return cfgBuilder; } @ParameterizedTest @@ -592,6 +589,7 @@ void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(boolean rollbackUsi metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieSparkTable.create(config, context(), metaClient); + table.getHoodieView().sync(); TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView(); long numLogFiles = 0; @@ -613,7 +611,8 @@ void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(boolean rollbackUsi //writeClient.commitCompaction(newCommitTime, statuses, Option.empty()); // Trigger a rollback of compaction table.getActiveTimeline().reload(); - writeClient.rollbackInflightCompaction(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, newCommitTime), table); + table.rollbackInflightCompaction(new HoodieInstant( + HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, newCommitTime)); metaClient = HoodieTableMetaClient.reload(metaClient); table = HoodieSparkTable.create(config, context(), metaClient); @@ -621,7 +620,7 @@ void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(boolean rollbackUsi ((SyncableFileSystemView) tableRTFileSystemView).reset(); for (String partitionPath : dataGen.getPartitionPaths()) { - List fileSlices = getFileSystemViewWithUnCommittedSlices(metaClient) + List fileSlices = getFileSystemViewWithUnCommittedSlices(metaClient) .getAllFileSlices(partitionPath).filter(fs -> fs.getBaseInstantTime().equals("100")).collect(Collectors.toList()); assertTrue(fileSlices.stream().noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); assertTrue(fileSlices.stream().anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java index 6e6738653972b..8b23cf25768e3 100644 --- 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.functional; +import org.apache.hudi.avro.model.HoodieRollbackRequest; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; @@ -32,7 +33,9 @@ import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.action.rollback.SparkMarkerBasedRollbackStrategy; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.rollback.BaseRollbackHelper; +import org.apache.hudi.table.action.rollback.MarkerBasedRollbackStrategy; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.hadoop.fs.FileStatus; @@ -93,8 +96,13 @@ public void testCopyOnWriteRollbackWithTestTable() throws Exception { .withMarkerFile("partA", f2, IOType.CREATE); // when - List stats = new SparkMarkerBasedRollbackStrategy(HoodieSparkTable.create(getConfig(), context, metaClient), context, getConfig(), "002") - .execute(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001")); + HoodieTable hoodieTable = HoodieSparkTable.create(getConfig(), context, metaClient); + List rollbackRequests = new MarkerBasedRollbackStrategy(hoodieTable, context, getConfig(), + "002").getRollbackRequests(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001")); + + List stats = new BaseRollbackHelper(hoodieTable.getMetaClient(), getConfig()).performRollback(context, + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001"), + rollbackRequests); // then: ensure files are deleted correctly, non-existent files reported as failed deletes assertEquals(2, stats.size()); @@ -175,9 +183,14 @@ private List testRun(boolean useFileListingMetadata, HoodieW writeStatuses = writeClient.upsert(jsc.parallelize(records, 1), newCommitTime); writeStatuses.collect(); + HoodieTable hoodieTable = HoodieSparkTable.create(getConfig(), context, metaClient); + List rollbackRequests = new MarkerBasedRollbackStrategy(hoodieTable, context, getConfig(), + "003").getRollbackRequests(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002")); + // rollback 2nd commit and ensure stats reflect the info. 
- return new SparkMarkerBasedRollbackStrategy(HoodieSparkTable.create(writeConfig, context, metaClient), context, writeConfig, "003") - .execute(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002")); + return new BaseRollbackHelper(hoodieTable.getMetaClient(), getConfig()).performRollback(context, + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002"), + rollbackRequests); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java index 792da4e086199..19ec4e6d0654c 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java @@ -85,7 +85,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; /** - * Unit tests {@link SparkUpgradeDowngrade}. + * Unit tests {@link UpgradeDowngrade}. */ public class TestUpgradeDowngrade extends HoodieClientTestBase { @@ -177,7 +177,8 @@ public void testUpgradeZeroToOneInternal(boolean induceResiduesFromPrevUpgrade, } // should re-create marker files for 2nd commit since its pending. - new SparkUpgradeDowngrade(metaClient, cfg, context).run(metaClient, HoodieTableVersion.ONE, cfg, context, null); + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.ONE, null); // assert marker files assertMarkerFilesForUpgrade(table, commitInstant, firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices); @@ -218,7 +219,8 @@ public void testUpgradeOneToTwo(HoodieTableType tableType) throws IOException { downgradeTableConfigsFromTwoToOne(cfg); // perform upgrade - new SparkUpgradeDowngrade(metaClient, cfg, context).run(metaClient, HoodieTableVersion.TWO, cfg, context, null); + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.TWO, null); // verify hoodie.table.version got upgraded metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()) @@ -321,7 +323,8 @@ public void testDowngrade(boolean deletePartialMarkerFiles, HoodieTableType tabl } // downgrade should be performed. 
all marker files should be deleted - new SparkUpgradeDowngrade(metaClient, cfg, context).run(metaClient, toVersion, cfg, context, null); + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(toVersion, null); // assert marker files assertMarkerFilesForDowngrade(table, commitInstant, toVersion == HoodieTableVersion.ONE); @@ -557,7 +560,7 @@ private void prepForDowngradeFromTwoToOne() throws IOException { private void createResidualFile() throws IOException { Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE); - Path updatedPropertyFile = new Path(metaClient.getMetaPath() + "/" + SparkUpgradeDowngrade.HOODIE_UPDATED_PROPERTY_FILE); + Path updatedPropertyFile = new Path(metaClient.getMetaPath() + "/" + UpgradeDowngrade.HOODIE_UPDATED_PROPERTY_FILE); // Step1: Copy hoodie.properties to hoodie.properties.orig FileUtil.copy(metaClient.getFs(), propertyFile, metaClient.getFs(), updatedPropertyFile, diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java index aa8814ad67c1b..0a010dde5b63a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java @@ -41,8 +41,9 @@ import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex.IndexType; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hadoop.fs.FileSystem; @@ -234,11 +235,11 @@ public void assertNodupesWithinPartition(List> private Function2, String, Integer> wrapRecordsGenFunctionForPreppedCalls( final HoodieWriteConfig writeConfig, final Function2, String, Integer> recordGenFunction) { return (commit, numRecords) -> { - final SparkHoodieIndex index = SparkHoodieIndex.createIndex(writeConfig); + final HoodieIndex index = SparkHoodieIndexFactory.createIndex(writeConfig); List records = recordGenFunction.apply(commit, numRecords); final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); HoodieSparkTable table = HoodieSparkTable.create(writeConfig, context, metaClient); - JavaRDD taggedRecords = index.tagLocation(jsc.parallelize(records, 1), context, table); + JavaRDD taggedRecords = tagLocation(index, jsc.parallelize(records, 1), table); return taggedRecords.collect(); }; } @@ -255,13 +256,13 @@ private Function2, String, Integer> wrapRecordsGenFunctionFor private Function> wrapDeleteKeysGenFunctionForPreppedCalls( final HoodieWriteConfig writeConfig, final Function> keyGenFunction) { return (numRecords) -> { - final SparkHoodieIndex index = SparkHoodieIndex.createIndex(writeConfig); + final HoodieIndex index = SparkHoodieIndexFactory.createIndex(writeConfig); List records = keyGenFunction.apply(numRecords); final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); HoodieSparkTable table = HoodieSparkTable.create(writeConfig, context, metaClient); JavaRDD recordsToDelete = 
jsc.parallelize(records, 1) .map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload())); - JavaRDD taggedRecords = index.tagLocation(recordsToDelete, context, table); + JavaRDD taggedRecords = tagLocation(index, recordsToDelete, table); return taggedRecords.map(record -> record.getKey()).collect(); }; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java index 1e52e449453df..9ed98b15cb7ab 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java @@ -21,7 +21,11 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.SparkTaskContextSupplier; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieTableType; @@ -29,15 +33,28 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; +import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.metadata.FileSystemBackedTableMetadata; +import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.MetadataPartitionType; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadStat; import org.apache.hadoop.conf.Configuration; @@ -53,19 +70,31 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SQLContext; import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestInfo; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Properties; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.stream.Collectors; import scala.Tuple2; 
+import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + /** * The test harness for resource initialization and cleanup. */ @@ -149,7 +178,7 @@ protected void initSparkContexts(String appName) { } /** - * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) + * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) * with a default name matching the name of the class. */ protected void initSparkContexts() { @@ -220,7 +249,15 @@ protected void initMetaClient() throws IOException { initMetaClient(getTableType()); } + protected void initMetaClient(Properties properties) throws IOException { + initMetaClient(getTableType(), properties); + } + protected void initMetaClient(HoodieTableType tableType) throws IOException { + initMetaClient(tableType, new Properties()); + } + + protected void initMetaClient(HoodieTableType tableType, Properties properties) throws IOException { if (basePath == null) { throw new IllegalStateException("The base path has not been initialized."); } @@ -229,7 +266,10 @@ protected void initMetaClient(HoodieTableType tableType) throws IOException { throw new IllegalStateException("The Spark context has not been initialized."); } - metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType); + if (tableName != null && !tableName.isEmpty()) { + properties.put(HoodieTableConfig.NAME.key(), tableName); + } + metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType, properties); } protected Properties getPropertiesForKeyGen() { @@ -376,15 +416,21 @@ public HoodieTableMetaClient getHoodieMetaClient(Configuration conf, String base } public HoodieTableFileSystemView getHoodieTableFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline, - FileStatus[] fileStatuses) { + FileStatus[] fileStatuses) { if (tableView == null) { - tableView = new HoodieTableFileSystemView(metaClient, visibleActiveTimeline, fileStatuses); + tableView = new HoodieTableFileSystemView(metaClient, visibleActiveTimeline, fileStatuses); } else { tableView.init(metaClient, visibleActiveTimeline, fileStatuses); } return tableView; } + public JavaRDD tagLocation( + HoodieIndex index, JavaRDD records, HoodieTable table) { + return HoodieJavaRDD.getJavaRDD( + index.tagLocation(HoodieJavaRDD.of(records), context, table)); + } + public static Pair, WorkloadStat> buildProfile(JavaRDD inputRecordsRDD) { HashMap partitionPathStatMap = new HashMap<>(); WorkloadStat globalStat = new WorkloadStat(); @@ -418,4 +464,176 @@ public static Pair, WorkloadStat> buildProfile(Jav } return Pair.of(partitionPathStatMap, globalStat); } + + /** + * Validate the metadata tables contents to ensure it matches what is on the file system. 
+ */ + public void validateMetadata(HoodieTestTable testTable, List inflightCommits, HoodieWriteConfig writeConfig, + String metadataTableBasePath, boolean doFullValidation) throws IOException { + HoodieTableMetadata tableMetadata = metadata(writeConfig, context); + assertNotNull(tableMetadata, "MetadataReader should have been initialized"); + if (!writeConfig.isMetadataTableEnabled()) { + return; + } + + if (!tableMetadata.getSyncedInstantTime().isPresent() || tableMetadata instanceof FileSystemBackedTableMetadata) { + throw new IllegalStateException("Metadata should have synced some commits or tableMetadata should not be an instance " + + "of FileSystemBackedTableMetadata"); + } + assertEquals(inflightCommits, testTable.inflightCommits()); + + HoodieTimer timer = new HoodieTimer().startTimer(); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + // Partitions should match + List fsPartitionPaths = testTable.getAllPartitionPaths(); + List fsPartitions = new ArrayList<>(); + fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString())); + List metadataPartitions = tableMetadata.getAllPartitionPaths(); + + Collections.sort(fsPartitions); + Collections.sort(metadataPartitions); + + assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match"); + assertEquals(fsPartitions, metadataPartitions, "Partitions should match"); + + // Files within each partition should match + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieSparkTable.create(writeConfig, engineContext, true); + TableFileSystemView tableView = table.getHoodieView(); + List fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList()); + Map partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths); + assertEquals(fsPartitions.size(), partitionToFilesMap.size()); + + fsPartitions.forEach(partition -> { + try { + validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition); + } catch (IOException e) { + fail("Exception should not be raised: " + e); + } + }); + if (doFullValidation) { + runFullValidation(writeConfig, metadataTableBasePath, engineContext); + } + + LOG.info("Validation time=" + timer.endTimer()); + } + + public void syncTableMetadata(HoodieWriteConfig writeConfig) { + if (!writeConfig.getMetadataConfig().enabled()) { + return; + } + // Open up the metadata table again, for syncing + try (HoodieTableMetadataWriter writer = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, writeConfig, context)) { + LOG.info("Successfully synced to metadata table"); + } catch (Exception e) { + throw new HoodieMetadataException("Error syncing to metadata table.", e); + } + } + + public HoodieBackedTableMetadataWriter metadataWriter(HoodieWriteConfig clientConfig) { + return (HoodieBackedTableMetadataWriter) SparkHoodieBackedTableMetadataWriter + .create(hadoopConf, clientConfig, new HoodieSparkEngineContext(jsc)); + } + + public HoodieTableMetadata metadata(HoodieWriteConfig clientConfig, HoodieEngineContext hoodieEngineContext) { + return HoodieTableMetadata.create(hoodieEngineContext, clientConfig.getMetadataConfig(), clientConfig.getBasePath(), + clientConfig.getSpillableMapBasePath()); + } + + protected void validateFilesPerPartition(HoodieTestTable testTable, HoodieTableMetadata tableMetadata, TableFileSystemView tableView, + Map partitionToFilesMap, String partition) throws IOException { + Path partitionPath; + 
if (partition.equals("")) { + // Should be the non-partitioned case + partitionPath = new Path(basePath); + } else { + partitionPath = new Path(basePath, partition); + } + + FileStatus[] fsStatuses = testTable.listAllFilesInPartition(partition); + FileStatus[] metaStatuses = tableMetadata.getAllFilesInPartition(partitionPath); + List fsFileNames = Arrays.stream(fsStatuses) + .map(s -> s.getPath().getName()).collect(Collectors.toList()); + List metadataFilenames = Arrays.stream(metaStatuses) + .map(s -> s.getPath().getName()).collect(Collectors.toList()); + Collections.sort(fsFileNames); + Collections.sort(metadataFilenames); + + if ((fsFileNames.size() != metadataFilenames.size()) || (!fsFileNames.equals(metadataFilenames))) { + LOG.info("*** File system listing = " + Arrays.toString(fsFileNames.toArray())); + LOG.info("*** Metadata listing = " + Arrays.toString(metadataFilenames.toArray())); + + for (String fileName : fsFileNames) { + if (!metadataFilenames.contains(fileName)) { + LOG.error(partition + "FsFilename " + fileName + " not found in Meta data"); + } + } + for (String fileName : metadataFilenames) { + if (!fsFileNames.contains(fileName)) { + LOG.error(partition + "Metadata file " + fileName + " not found in original FS"); + } + } + } + assertEquals(fsStatuses.length, partitionToFilesMap.get(basePath + "/" + partition).length); + + // Block sizes should be valid + Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0)); + List fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).sorted().collect(Collectors.toList()); + List metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).sorted().collect(Collectors.toList()); + assertEquals(fsBlockSizes, metadataBlockSizes); + + assertEquals(fsFileNames.size(), metadataFilenames.size(), "Files within partition " + partition + " should match"); + assertEquals(fsFileNames, metadataFilenames, "Files within partition " + partition + " should match"); + + // FileSystemView should expose the same data + List fileGroups = tableView.getAllFileGroups(partition).collect(Collectors.toList()); + fileGroups.addAll(tableView.getAllReplacedFileGroups(partition).collect(Collectors.toList())); + + fileGroups.forEach(g -> LogManager.getLogger(getClass()).info(g)); + fileGroups.forEach(g -> g.getAllBaseFiles().forEach(b -> LogManager.getLogger(getClass()).info(b))); + fileGroups.forEach(g -> g.getAllFileSlices().forEach(s -> LogManager.getLogger(getClass()).info(s))); + + long numFiles = fileGroups.stream() + .mapToLong(g -> g.getAllBaseFiles().count() + g.getAllFileSlices().mapToLong(s -> s.getLogFiles().count()).sum()) + .sum(); + assertEquals(metadataFilenames.size(), numFiles); + } + + private void runFullValidation(HoodieWriteConfig writeConfig, String metadataTableBasePath, HoodieSparkEngineContext engineContext) { + HoodieBackedTableMetadataWriter metadataWriter = metadataWriter(writeConfig); + assertNotNull(metadataWriter, "MetadataWriter should have been initialized"); + + // Validate write config for metadata table + HoodieWriteConfig metadataWriteConfig = metadataWriter.getWriteConfig(); + assertFalse(metadataWriteConfig.isMetadataTableEnabled(), "No metadata table for metadata table"); + + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + + // Metadata table is MOR + assertEquals(metadataMetaClient.getTableType(), HoodieTableType.MERGE_ON_READ, "Metadata Table should be MOR"); + + // Metadata 
table is HFile format + assertEquals(metadataMetaClient.getTableConfig().getBaseFileFormat(), HoodieFileFormat.HFILE, + "Metadata Table base file format should be HFile"); + + // Metadata table has a fixed number of partitions + // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory + // in the .hoodie folder. + List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, HoodieTableMetadata.getMetadataTableBasePath(basePath), + false, false); + Assertions.assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size()); + + // Metadata table should automatically compact and clean + // versions are +1 as autoclean / compaction happens end of commits + int numFileVersions = metadataWriteConfig.getCleanerFileVersionsRetained() + 1; + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline()); + metadataTablePartitions.forEach(partition -> { + List latestSlices = fsView.getLatestFileSlices(partition).collect(Collectors.toList()); + assertTrue(latestSlices.stream().map(FileSlice::getBaseFile).count() <= 1, "Should have a single latest base file"); + assertTrue(latestSlices.size() <= 1, "Should have a single latest file slice"); + assertTrue(latestSlices.size() <= numFileVersions, "Should limit file slice to " + + numFileVersions + " but was " + latestSlices.size()); + }); + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java index b8b4704348445..79fbdcaad93af 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java @@ -41,6 +41,7 @@ import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.SimpleKeyGenerator; import org.apache.hudi.table.HoodieSparkTable; @@ -51,6 +52,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -114,6 +116,10 @@ public Configuration hadoopConf() { return jsc.hadoopConfiguration(); } + public FileSystem fs() { + return FSUtils.getFs(basePath(), hadoopConf()); + } + @Override public HoodieSparkEngineContext context() { return context; @@ -171,14 +177,32 @@ public synchronized void runBeforeEach() { } } + /** + * To clean up Spark resources after all testcases have run in functional tests. + * + * Spark session and contexts were reused for testcases in the same test class. Some + * testcase may invoke this specifically to clean up in case of repeated test runs. 
+ */ @AfterAll - public static synchronized void cleanUpAfterAll() { + public static synchronized void resetSpark() { if (spark != null) { spark.close(); spark = null; } } + protected JavaRDD tagLocation( + HoodieIndex index, JavaRDD records, HoodieTable table) { + return HoodieJavaRDD.getJavaRDD( + index.tagLocation(HoodieJavaRDD.of(records), context, table)); + } + + protected JavaRDD updateLocation( + HoodieIndex index, JavaRDD writeStatus, HoodieTable table) { + return HoodieJavaRDD.getJavaRDD( + index.updateLocation(HoodieJavaRDD.of(writeStatus), context, table)); + } + protected void insertRecords(HoodieTableMetaClient metaClient, List records, SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime) throws IOException { HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(metaClient); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java index be15dc85d0560..92b1f76ac4024 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java @@ -39,6 +39,8 @@ default SparkConf conf(Map overwritingConfigs) { SparkConf sparkConf = new SparkConf(); sparkConf.set("spark.app.name", getClass().getName()); sparkConf.set("spark.master", "local[*]"); + sparkConf.set("spark.default.parallelism", "4"); + sparkConf.set("spark.sql.shuffle.partitions", "4"); sparkConf.set("spark.driver.maxResultSize", "2g"); sparkConf.set("spark.hadoop.mapred.output.compress", "true"); sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); @@ -52,4 +54,4 @@ default SparkConf conf(Map overwritingConfigs) { default SparkConf conf() { return conf(Collections.emptyMap()); } -} \ No newline at end of file +} diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml index 71326e7504e62..bbf2b2972910d 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -248,5 +248,13 @@ + + + + org.lz4 + lz4-java + 1.8.0 + + diff --git a/hudi-common/src/main/avro/HoodieRollbackPlan.avsc b/hudi-common/src/main/avro/HoodieRollbackPlan.avsc new file mode 100644 index 0000000000000..99e0755bd52ce --- /dev/null +++ b/hudi-common/src/main/avro/HoodieRollbackPlan.avsc @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "HoodieRollbackPlan", + "fields": [ + { + "name": "instantToRollback", + "doc": "Hoodie instant that needs to be rolled back", + "type": ["null", "HoodieInstantInfo"], + "default": null + }, + { + "name": "RollbackRequests", + "type":["null", { + "type":"array", + "items":{ + "type": "record", + "name": "HoodieRollbackRequest", + "fields": [ + {"name": "partitionPath", "type": "string"}, + {"name": "fileId", + "type":["null", "string"], + "default": null + }, + {"name": "latestBaseInstant", + "type":["null", "string"], + "default": null + }, + {"name": "filesToBeDeleted", + "default": [], + "type": { + "type": "array", + "default": [], + "items": "string" + } + }, + {"name": "logBlocksToBeDeleted", + "type": ["null", { + "type": "map", + "doc": "Log blocks that need to be deleted as part of the rollback", + "values": { + "type": "long", + "doc": "Size of this file/block in bytes" + } + }], + "default":null + } + ] + } + }], + "default" : null + }, + { + "name":"version", + "type":["int", "null"], + "default": 1 + } + ] +} diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index 96b1a1e83abb5..91c214713e31b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -18,8 +18,6 @@ package org.apache.hudi.avro; -import org.apache.avro.specific.SpecificRecordBase; - import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; @@ -49,12 +47,14 @@ import org.apache.avro.io.EncoderFactory; import org.apache.avro.io.JsonDecoder; import org.apache.avro.io.JsonEncoder; +import org.apache.avro.specific.SpecificRecordBase; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.math.BigDecimal; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.time.LocalDate; @@ -546,8 +546,11 @@ private static Object convertValueForAvroLogicalTypes(Schema fieldSchema, Object return decimalConversion.fromFixed((GenericFixed) fieldValue, fieldSchema, LogicalTypes.decimal(dc.getPrecision(), dc.getScale())); } else if (fieldSchema.getType() == Schema.Type.BYTES) { - return decimalConversion.fromBytes((ByteBuffer) fieldValue, fieldSchema, - LogicalTypes.decimal(dc.getPrecision(), dc.getScale())); + ByteBuffer byteBuffer = (ByteBuffer) fieldValue; + BigDecimal convertedValue = decimalConversion.fromBytes(byteBuffer, fieldSchema, + LogicalTypes.decimal(dc.getPrecision(), dc.getScale())); + byteBuffer.rewind(); + return convertedValue; } } return fieldValue; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java index 18b7de2fdd907..08e1bb4a87a50 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java @@ -74,7 +74,7 @@ public static String getDescription(Names names) { + "Hudi stats and metrics."; break; case KAFKA_CONNECT: - description = "These set of configs are used for Kakfa Connect Sink Connector for writing Hudi Tables"; + description = "These set of configs are used for Kafka Connect Sink 
Connector for writing Hudi Tables"; break; default: description = "Please fill in the description for Config Group Name: " + names.name; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java index 1f646aa8d0ff1..ed2b90eeae4f2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java @@ -74,6 +74,10 @@ public void setDefaultValue(ConfigProperty configProperty, T defaultVal) } } + public Boolean contains(String key) { + return props.containsKey(key); + } + public boolean contains(ConfigProperty configProperty) { if (props.containsKey(configProperty.key())) { return true; @@ -135,7 +139,7 @@ public Boolean getBoolean(ConfigProperty configProperty) { public boolean getBooleanOrDefault(ConfigProperty configProperty) { Option rawValue = getRawValue(configProperty); return rawValue.map(v -> Boolean.parseBoolean(v.toString())) - .orElse((Boolean) configProperty.defaultValue()); + .orElse(Boolean.parseBoolean(configProperty.defaultValue().toString())); } public Long getLong(ConfigProperty configProperty) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java index f55f46bb69488..d526294407bb9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java @@ -41,23 +41,9 @@ public final class HoodieMetadataConfig extends HoodieConfig { // Enable the internal Metadata Table which saves file listings public static final ConfigProperty ENABLE = ConfigProperty .key(METADATA_PREFIX + ".enable") - .defaultValue(false) - .sinceVersion("0.7.0") - .withDocumentation("Enable the internal metadata table which serves table metadata like level file listings"); - - // Enable syncing the Metadata Table - public static final ConfigProperty SYNC_ENABLE = ConfigProperty - .key(METADATA_PREFIX + ".sync.enable") .defaultValue(true) - .sinceVersion("0.9.0") - .withDocumentation("Enable syncing of metadata table from actions on the dataset"); - - // Validate contents of Metadata Table on each access against the actual filesystem - public static final ConfigProperty VALIDATE_ENABLE = ConfigProperty - .key(METADATA_PREFIX + ".validate") - .defaultValue(false) .sinceVersion("0.7.0") - .withDocumentation("Validate contents of metadata table on each access; e.g against the actual listings from lake storage"); + .withDocumentation("Enable the internal metadata table which serves table metadata like level file listings"); public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = false; @@ -85,7 +71,7 @@ public final class HoodieMetadataConfig extends HoodieConfig { // Maximum delta commits before compaction occurs public static final ConfigProperty COMPACT_NUM_DELTA_COMMITS = ConfigProperty .key(METADATA_PREFIX + ".compact.max.delta.commits") - .defaultValue(24) + .defaultValue(10) .sinceVersion("0.7.0") .withDocumentation("Controls how often the metadata table is compacted."); @@ -129,6 +115,20 @@ public final class HoodieMetadataConfig extends HoodieConfig { .sinceVersion("0.7.0") .withDocumentation("Parallelism to use, when listing the table on lake storage."); + public static final ConfigProperty ENABLE_INLINE_READING = ConfigProperty + .key(METADATA_PREFIX + 
".enable.inline.reading") + .defaultValue(true) + .sinceVersion("0.10.0") + .withDocumentation("Enable inline reading of Log files. By default log block contents are read as byte[] using regular input stream and records " + + "are deserialized from it. Enabling this will read each log block as an inline file and read records from the same. For instance, " + + "for HFileDataBlock, a inline file will be read using HFileReader."); + + public static final ConfigProperty ENABLE_FULL_SCAN_LOG_FILES = ConfigProperty + .key(METADATA_PREFIX + ".enable.full.scan.log.files") + .defaultValue(true) + .sinceVersion("0.10.0") + .withDocumentation("Enable full scanning of log files while reading log records. If disabled, hudi does look up of only interested entries."); + private HoodieMetadataConfig() { super(); } @@ -149,14 +149,6 @@ public boolean enabled() { return getBoolean(ENABLE); } - public boolean enableSync() { - return enabled() && getBoolean(HoodieMetadataConfig.SYNC_ENABLE); - } - - public boolean validateFileListingMetadata() { - return getBoolean(VALIDATE_ENABLE); - } - public boolean enableMetrics() { return getBoolean(METRICS_ENABLE); } @@ -165,6 +157,10 @@ public String getDirectoryFilterRegex() { return getString(DIR_FILTER_REGEX); } + public boolean enableFullScan() { + return getBoolean(ENABLE_FULL_SCAN_LOG_FILES); + } + public static class Builder { private final HoodieMetadataConfig metadataConfig = new HoodieMetadataConfig(); @@ -186,21 +182,11 @@ public Builder enable(boolean enable) { return this; } - public Builder enableSync(boolean enable) { - metadataConfig.setValue(SYNC_ENABLE, String.valueOf(enable)); - return this; - } - public Builder enableMetrics(boolean enableMetrics) { metadataConfig.setValue(METRICS_ENABLE, String.valueOf(enableMetrics)); return this; } - public Builder validate(boolean validate) { - metadataConfig.setValue(VALIDATE_ENABLE, String.valueOf(validate)); - return this; - } - public Builder withInsertParallelism(int parallelism) { metadataConfig.setValue(INSERT_PARALLELISM_VALUE, String.valueOf(parallelism)); return this; @@ -242,6 +228,11 @@ public Builder withDirectoryFilterRegex(String regex) { return this; } + public Builder enableFullScan(boolean enableFullScan) { + metadataConfig.setValue(ENABLE_FULL_SCAN_LOG_FILES, String.valueOf(enableFullScan)); + return this; + } + public HoodieMetadataConfig build() { metadataConfig.setDefaults(HoodieMetadataConfig.class.getName()); return metadataConfig; @@ -258,16 +249,6 @@ public HoodieMetadataConfig build() { */ @Deprecated public static final boolean DEFAULT_METADATA_ENABLE = ENABLE.defaultValue(); - /** - * @deprecated Use {@link #VALIDATE_ENABLE} and its methods. - */ - @Deprecated - public static final String METADATA_VALIDATE_PROP = VALIDATE_ENABLE.key(); - /** - * @deprecated Use {@link #VALIDATE_ENABLE} and its methods. - */ - @Deprecated - public static final boolean DEFAULT_METADATA_VALIDATE = VALIDATE_ENABLE.defaultValue(); /** * @deprecated Use {@link #METRICS_ENABLE} and its methods. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAccumulator.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAccumulator.java new file mode 100644 index 0000000000000..61fb98e1acc25 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAccumulator.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import java.io.Serializable; + +/** + * An abstraction for accumulator on counts. + */ +public abstract class HoodieAccumulator implements Serializable { + /** + * @return the count. + */ + public abstract long value(); + + /** + * Increments the count based on the input. + * + * @param increment the value to add. + */ + public abstract void add(long increment); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAtomicLongAccumulator.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAtomicLongAccumulator.java new file mode 100644 index 0000000000000..3ace1c7a4a099 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAtomicLongAccumulator.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * An accumulator on counts based on {@link AtomicLong} implementation. + */ +public class HoodieAtomicLongAccumulator extends HoodieAccumulator { + + private final AtomicLong accumulator; + + private HoodieAtomicLongAccumulator() { + accumulator = new AtomicLong(0L); + } + + public static HoodieAtomicLongAccumulator create() { + return new HoodieAtomicLongAccumulator(); + } + + @Override + public long value() { + return accumulator.get(); + } + + @Override + public void add(long increment) { + accumulator.addAndGet(increment); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieData.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieData.java new file mode 100644 index 0000000000000..7ea7e0d649f34 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieData.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
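
HoodieAtomicLongAccumulator above gives non-Spark code paths a thread-safe counter behind the engine-neutral HoodieAccumulator abstraction. A short sketch of the intended call pattern (assumed usage):

    import org.apache.hudi.common.data.HoodieAccumulator;
    import org.apache.hudi.common.data.HoodieAtomicLongAccumulator;

    class AccumulatorSketch {
      static long totalRecords(long[] batchSizes) {
        HoodieAccumulator counter = HoodieAtomicLongAccumulator.create();
        for (long size : batchSizes) {
          counter.add(size);       // AtomicLong-backed, safe under concurrent updates
        }
        return counter.value();    // running total
      }
    }
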
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; + +import java.io.Serializable; +import java.util.Iterator; +import java.util.List; + +/** + * An abstraction for a data collection of objects in type T to store the reference + * and do transformation. + * + * @param type of object. + */ +public abstract class HoodieData implements Serializable { + /** + * @return the collection of objects. + */ + public abstract Object get(); + + /** + * Caches the data. + * + * @param cacheConfig config value for caching. + */ + public abstract void persist(String cacheConfig); + + /** + * Removes the cached data. + */ + public abstract void unpersist(); + + /** + * @return whether the collection is empty. + */ + public abstract boolean isEmpty(); + + /** + * @return the number of objects. + */ + public abstract long count(); + + /** + * @param func serializable map function. + * @param output object type. + * @return {@link HoodieData} containing the result. Actual execution may be deferred. + */ + public abstract HoodieData map(SerializableFunction func); + + /** + * @param func serializable map function by taking a partition of objects + * and generating an iterator. + * @param preservesPartitioning whether to preserve partitions in the result. + * @param output object type. + * @return {@link HoodieData} containing the result. Actual execution may be deferred. + */ + public abstract HoodieData mapPartitions( + SerializableFunction, Iterator> func, boolean preservesPartitioning); + + /** + * @param func serializable flatmap function. + * @param output object type. + * @return {@link HoodieData} containing the result. Actual execution may be deferred. + */ + public abstract HoodieData flatMap(SerializableFunction> func); + + /** + * @param mapToPairFunc serializable map function to generate a pair. + * @param key type of the pair. + * @param value type of the pair. + * @return {@link HoodiePairData} containing the result. Actual execution may be deferred. + */ + public abstract HoodiePairData mapToPair(SerializablePairFunction mapToPairFunc); + + /** + * @return distinct objects in {@link HoodieData}. + */ + public abstract HoodieData distinct(); + + /** + * @return collected results in {@link List}. + */ + public abstract List collectAsList(); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieList.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieList.java new file mode 100644 index 0000000000000..6c23fdff22166 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieList.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; + +/** + * Holds a {@link List} of objects. + * + * @param type of object. + */ +public class HoodieList extends HoodieData { + + private final List listData; + + private HoodieList(List listData) { + this.listData = listData; + } + + /** + * @param listData a {@link List} of objects in type T. + * @param type of object. + * @return a new instance containing the {@link List} reference. + */ + public static HoodieList of(List listData) { + return new HoodieList<>(listData); + } + + /** + * @param hoodieData {@link HoodieList } instance containing the {@link List} of objects. + * @param type of object. + * @return the a {@link List} of objects in type T. 
+ */ + public static List getList(HoodieData hoodieData) { + return ((HoodieList) hoodieData).get(); + } + + @Override + public List get() { + return listData; + } + + @Override + public void persist(String cacheConfig) { + // No OP + } + + @Override + public void unpersist() { + // No OP + } + + @Override + public boolean isEmpty() { + return listData.isEmpty(); + } + + @Override + public long count() { + return listData.size(); + } + + @Override + public HoodieData map(SerializableFunction func) { + return HoodieList.of(listData.stream().parallel() + .map(throwingMapWrapper(func)).collect(Collectors.toList())); + } + + @Override + public HoodieData mapPartitions(SerializableFunction, Iterator> func, boolean preservesPartitioning) { + List result = new ArrayList<>(); + throwingMapWrapper(func).apply(listData.iterator()).forEachRemaining(result::add); + return HoodieList.of(result); + } + + @Override + public HoodieData flatMap(SerializableFunction> func) { + Function> throwableFunc = throwingMapWrapper(func); + return HoodieList.of(listData.stream().flatMap(e -> { + List result = new ArrayList<>(); + Iterator iterator = throwableFunc.apply(e); + iterator.forEachRemaining(result::add); + return result.stream(); + }).collect(Collectors.toList())); + } + + @Override + public HoodiePairData mapToPair(SerializablePairFunction mapToPairFunc) { + Map> mapOfPairs = new HashMap<>(); + Function> throwableMapToPairFunc = throwingMapToPairWrapper(mapToPairFunc); + listData.forEach(data -> { + Pair pair = throwableMapToPairFunc.apply(data); + List list = mapOfPairs.computeIfAbsent(pair.getKey(), k -> new ArrayList<>()); + list.add(pair.getValue()); + }); + return HoodieMapPair.of(mapOfPairs); + } + + @Override + public HoodieData distinct() { + return HoodieList.of(new ArrayList<>(new HashSet<>(listData))); + } + + @Override + public List collectAsList() { + return listData; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieMapPair.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieMapPair.java new file mode 100644 index 0000000000000..c941231e617f1 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieMapPair.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
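
HoodieList wraps a plain java.util.List so code written against the HoodieData API can run without a Spark context. A rough sketch of a transformation chain over illustrative inputs:

    import org.apache.hudi.common.data.HoodieData;
    import org.apache.hudi.common.data.HoodieList;

    import java.util.Arrays;
    import java.util.List;

    class HoodieListSketch {
      static List<Integer> partitionDepths() {
        HoodieData<String> partitions =
            HoodieList.of(Arrays.asList("2021/09/01", "2021/09/02", "2021/09/01"));
        // For the list-backed implementation map() runs eagerly over a parallel stream.
        HoodieData<Integer> depths = partitions.distinct().map(p -> p.split("/").length);
        return HoodieList.getList(depths);   // unwrap back to a plain List
      }
    }
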
+ */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.function.FunctionWrapper; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; + +/** + * Implementation of {@link HoodiePairData} using Java {@link Map}. + * The pairs are organized by the key in the Map and values for the same key + * are stored in a list as the value corresponding to the key in the Map. + * + * @param type of key. + * @param type of value. + */ +public class HoodieMapPair extends HoodiePairData { + + private final Map> mapPairData; + + private HoodieMapPair(Map> mapPairData) { + this.mapPairData = mapPairData; + } + + /** + * @param mapPairData a {@link Map} of pairs. + * @param type of key. + * @param type of value. + * @return a new instance containing the {@link Map>} reference. + */ + public static HoodieMapPair of(Map> mapPairData) { + return new HoodieMapPair<>(mapPairData); + } + + /** + * @param hoodiePairData {@link HoodieMapPair } instance containing the {@link Map} of pairs. + * @param type of key. + * @param type of value. + * @return the {@link Map} of pairs. + */ + public static Map> getMapPair(HoodiePairData hoodiePairData) { + return ((HoodieMapPair) hoodiePairData).get(); + } + + @Override + public Map> get() { + return mapPairData; + } + + @Override + public void persist(String cacheConfig) { + // No OP + } + + @Override + public void unpersist() { + // No OP + } + + @Override + public HoodieData keys() { + return HoodieList.of(new ArrayList<>(mapPairData.keySet())); + } + + @Override + public HoodieData values() { + return HoodieList.of( + mapPairData.values().stream().flatMap(List::stream).collect(Collectors.toList())); + } + + @Override + public long count() { + return mapPairData.values().stream().map( + list -> (long) list.size()).reduce(Long::sum).orElse(0L); + } + + @Override + public Map countByKey() { + return mapPairData.entrySet().stream().collect( + Collectors.toMap(Map.Entry::getKey, entry -> (long) entry.getValue().size())); + } + + @Override + public HoodieData map(SerializableFunction, O> func) { + Function, O> throwableFunc = throwingMapWrapper(func); + return HoodieList.of( + streamAllPairs().map(throwableFunc).collect(Collectors.toList())); + } + + @Override + public HoodiePairData mapToPair(SerializablePairFunction, L, W> mapToPairFunc) { + Map> newMap = new HashMap<>(); + Function, Pair> throwableMapToPairFunc = + FunctionWrapper.throwingMapToPairWrapper(mapToPairFunc); + streamAllPairs().map(pair -> throwableMapToPairFunc.apply(pair)).forEach(newPair -> { + List list = newMap.computeIfAbsent(newPair.getKey(), k -> new ArrayList<>()); + list.add(newPair.getValue()); + }); + return HoodieMapPair.of(newMap); + } + + @Override + public HoodiePairData>> leftOuterJoin(HoodiePairData other) { + Map> otherMapPairData = HoodieMapPair.getMapPair(other); + Stream>>>> pairs = streamAllPairs() + .map(pair -> new ImmutablePair<>(pair.getKey(), new ImmutablePair<>( + pair.getValue(), Option.ofNullable(otherMapPairData.get(pair.getKey()))))); + 
Map>>> resultMap = new HashMap<>(); + pairs.forEach(pair -> { + K key = pair.getKey(); + ImmutablePair>> valuePair = pair.getValue(); + List>> resultList = resultMap.computeIfAbsent(key, k -> new ArrayList<>()); + if (!valuePair.getRight().isPresent()) { + resultList.add(new ImmutablePair<>(valuePair.getLeft(), Option.empty())); + } else { + resultList.addAll(valuePair.getRight().get().stream().map( + w -> new ImmutablePair<>(valuePair.getLeft(), Option.of(w))).collect(Collectors.toList())); + } + }); + return HoodieMapPair.of(resultMap); + } + + private Stream> streamAllPairs() { + return mapPairData.entrySet().stream().flatMap( + entry -> entry.getValue().stream().map(e -> new ImmutablePair<>(entry.getKey(), e))); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodiePairData.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodiePairData.java new file mode 100644 index 0000000000000..b9bdcb3d9ff4a --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodiePairData.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; + +import java.io.Serializable; +import java.util.Map; + +/** + * An abstraction for pairs of key in type K and value in type V to store the reference + * and do transformation. + * + * @param type of key. + * @param type of value. + */ +public abstract class HoodiePairData implements Serializable { + /** + * @return the collection of pairs. + */ + public abstract Object get(); + + /** + * Caches the data. + * + * @param cacheConfig config value for caching. + */ + public abstract void persist(String cacheConfig); + + /** + * Removes the cached data. + */ + public abstract void unpersist(); + + /** + * @return all keys in {@link HoodieData}. + */ + public abstract HoodieData keys(); + + /** + * @return all values in {@link HoodieData}. + */ + public abstract HoodieData values(); + + /** + * @return the number of pairs. + */ + public abstract long count(); + + /** + * @return the number of pairs per key in a {@link Map}. + */ + public abstract Map countByKey(); + + /** + * @param func serializable map function. + * @param output object type. + * @return {@link HoodieData} containing the result. Actual execution may be deferred. + */ + public abstract HoodieData map(SerializableFunction, O> func); + + /** + * @param mapToPairFunc serializable map function to generate another pair. + * @param new key type. + * @param new value type. 
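
HoodieMapPair above keeps pairs grouped as a Map from key to the list of values for that key, which makes countByKey and leftOuterJoin easy to trace. A small sketch of the expected shapes, using made-up data:

    import org.apache.hudi.common.data.HoodieMapPair;
    import org.apache.hudi.common.data.HoodiePairData;
    import org.apache.hudi.common.util.Option;
    import org.apache.hudi.common.util.collection.Pair;

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    class PairDataSketch {
      static void sketch() {
        Map<String, List<Long>> left = new HashMap<>();
        left.put("fileA", Arrays.asList(1L, 2L));
        left.put("fileB", Arrays.asList(3L));

        Map<String, List<String>> right = new HashMap<>();
        right.put("fileA", Arrays.asList("x"));

        HoodiePairData<String, Long> pairs = HoodieMapPair.of(left);
        Map<String, Long> counts = pairs.countByKey();   // {fileA=2, fileB=1}

        // Left outer join keeps every left value; keys missing on the right get Option.empty().
        HoodiePairData<String, Pair<Long, Option<String>>> joined =
            pairs.leftOuterJoin(HoodieMapPair.of(right));
        // fileA -> [(1, Option("x")), (2, Option("x"))], fileB -> [(3, Option.empty())]
      }
    }
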
+ * @return {@link HoodiePairData} containing the result. Actual execution may be deferred. + */ + public abstract HoodiePairData mapToPair( + SerializablePairFunction, L, W> mapToPairFunc); + + /** + * Performs a left outer join of this and other. For each element (k, v) in this, + * the resulting HoodiePairData will either contain all pairs (k, (v, Some(w))) for w in other, + * or the pair (k, (v, None)) if no elements in other have key k. + * + * @param other the other {@link HoodiePairData} + * @param value type of the other {@link HoodiePairData} + * @return {@link HoodiePairData>>} containing the left outer join result. + * Actual execution may be deferred. + */ + public abstract HoodiePairData>> leftOuterJoin(HoodiePairData other); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieEngineContext.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieEngineContext.java index 0128ce52b85ff..d400a10f68a10 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieEngineContext.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieEngineContext.java @@ -19,11 +19,18 @@ package org.apache.hudi.common.engine; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.function.SerializableBiFunction; import org.apache.hudi.common.function.SerializableConsumer; import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; import org.apache.hudi.common.function.SerializablePairFunction; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.stream.Stream; @@ -54,8 +61,28 @@ public TaskContextSupplier getTaskContextSupplier() { return taskContextSupplier; } + public abstract HoodieAccumulator newAccumulator(); + + public abstract HoodieData emptyHoodieData(); + + public HoodieData parallelize(List data) { + return parallelize(data, data.size()); + } + + public abstract HoodieData parallelize(List data, int parallelism); + public abstract List map(List data, SerializableFunction func, int parallelism); + public abstract List mapToPairAndReduceByKey( + List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism); + + public abstract Stream> mapPartitionsToPairAndReduceByKey( + Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism); + + public abstract List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism); + public abstract List flatMap(List data, SerializableFunction> func, int parallelism); public abstract void foreach(List data, SerializableConsumer consumer, int parallelism); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieLocalEngineContext.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieLocalEngineContext.java index e8045670244b2..c99430e284db9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieLocalEngineContext.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieLocalEngineContext.java @@ -19,24 +19,37 @@ package org.apache.hudi.common.engine; import 
org.apache.hadoop.conf.Configuration; + import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieAtomicLongAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieList; +import org.apache.hudi.common.function.SerializableBiFunction; import org.apache.hudi.common.function.SerializableConsumer; import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; import org.apache.hudi.common.function.SerializablePairFunction; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; +import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.stream.Collectors; import java.util.stream.Stream; import static java.util.stream.Collectors.toList; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapToPairWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingReduceWrapper; /** * A java based engine context, use this implementation on the query engine integrations if needed. @@ -51,11 +64,56 @@ public HoodieLocalEngineContext(Configuration conf, TaskContextSupplier taskCont super(new SerializableConfiguration(conf), taskContextSupplier); } + @Override + public HoodieAccumulator newAccumulator() { + return HoodieAtomicLongAccumulator.create(); + } + + @Override + public HoodieData emptyHoodieData() { + return HoodieList.of(Collections.emptyList()); + } + + @Override + public HoodieData parallelize(List data, int parallelism) { + return HoodieList.of(data); + } + @Override public List map(List data, SerializableFunction func, int parallelism) { return data.stream().parallel().map(throwingMapWrapper(func)).collect(toList()); } + @Override + public List mapToPairAndReduceByKey( + List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel().map(throwingMapToPairWrapper(mapToPairFunc)) + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).get()) + .collect(Collectors.toList()); + } + + @Override + public Stream> mapPartitionsToPairAndReduceByKey( + Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism) { + return throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) + .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() + .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( + Pair::getValue).reduce(throwingReduceWrapper(reduceFunc)).orElse(null))) + .filter(Objects::nonNull); + } + + @Override + public List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel() + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> 
list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(toList()); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index 5c439f51aa5b0..8273ca7f35f80 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -26,7 +26,6 @@ import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ImmutablePair; @@ -167,6 +166,9 @@ public static String getCommitFromCommitFile(String commitFileName) { } public static String getCommitTime(String fullFileName) { + if (isLogFile(new Path(fullFileName))) { + return fullFileName.split("_")[1].split("\\.")[0]; + } return fullFileName.split("_")[2].split("\\.")[0]; } @@ -268,11 +270,10 @@ public static void processFiles(FileSystem fs, String basePathStr, Function getAllPartitionPaths(HoodieEngineContext engineContext, String basePathStr, - boolean useFileListingFromMetadata, boolean verifyListings, + boolean useFileListingFromMetadata, boolean assumeDatePartitioning) { HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() .enable(useFileListingFromMetadata) - .validate(verifyListings) .withAssumeDatePartitioning(assumeDatePartitioning) .build(); try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr, @@ -534,15 +535,6 @@ public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final return recovered; } - public static void deleteInstantFile(FileSystem fs, String metaPath, HoodieInstant instant) { - try { - LOG.warn("try to delete instant file: " + instant); - fs.delete(new Path(metaPath, instant.getFileName()), false); - } catch (IOException e) { - throw new HoodieIOException("Could not delete instant file" + instant.getFileName(), e); - } - } - public static void createPathIfNotExists(FileSystem fs, Path partitionPath) throws IOException { if (!fs.exists(partitionPath)) { fs.mkdirs(partitionPath); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/StorageSchemes.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/StorageSchemes.java index e166fc48e6d99..1cc00df4ebd7a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/StorageSchemes.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/StorageSchemes.java @@ -62,6 +62,8 @@ public enum StorageSchemes { OBS("obs", false), // Kingsoft Standard Storage ks3 KS3("ks3", false), + // JuiceFileSystem + JFS("jfs", true), // Baidu Object Storage BOS("bos", false); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/function/FunctionWrapper.java b/hudi-common/src/main/java/org/apache/hudi/common/function/FunctionWrapper.java index 405f57eeedfc5..40e1a9d3f7c46 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/function/FunctionWrapper.java +++ 
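
The getCommitTime change above accounts for the two file-name layouts: base files carry the instant time in the third underscore-separated token, while log files (which start with a dot and embed ".log.") carry it in the second. A hedged sketch with illustrative names:

    import org.apache.hudi.common.fs.FSUtils;

    class CommitTimeSketch {
      static void sketch() {
        // Base file: <fileId>_<writeToken>_<instantTime>.parquet
        String baseFile = "abc123-0_1-0-1_20211001093000.parquet";
        String t1 = FSUtils.getCommitTime(baseFile);   // "20211001093000"

        // Log file: .<fileId>_<baseInstantTime>.log.<version>_<writeToken>
        String logFile = ".abc123-0_20211001093000.log.1_1-0-1";
        String t2 = FSUtils.getCommitTime(logFile);    // "20211001093000"; before the fix the write token came back
      }
    }
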
b/hudi-common/src/main/java/org/apache/hudi/common/function/FunctionWrapper.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; +import java.util.function.BinaryOperator; import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Stream; @@ -70,4 +71,25 @@ public static Function> throwingMapToPairWrapper(Seriali } }; } + + public static Function>> throwingFlatMapToPairWrapper( + SerializablePairFlatMapFunction throwingPairFlatMapFunction) { + return v1 -> { + try { + return throwingPairFlatMapFunction.call(v1); + } catch (Exception e) { + throw new HoodieException("Error occurs when executing mapToPair", e); + } + }; + } + + public static BinaryOperator throwingReduceWrapper(SerializableBiFunction throwingReduceFunction) { + return (v1, v2) -> { + try { + return throwingReduceFunction.apply(v1, v2); + } catch (Exception e) { + throw new HoodieException("Error occurs when executing mapToPair", e); + } + }; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableBiFunction.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableBiFunction.java new file mode 100644 index 0000000000000..940396cf8e1ec --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableBiFunction.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.function; + +import java.io.Serializable; + +/** + * A function that accepts two arguments and produces a result. + * + * @param the type of the first argument to the function + * @param the type of the second argument to the function + * @param the type of the result of the function + */ +@FunctionalInterface +public interface SerializableBiFunction extends Serializable { + R apply(T t, U u); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFlatMapFunction.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFlatMapFunction.java new file mode 100644 index 0000000000000..4cc34ce6ee84c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFlatMapFunction.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
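
throwingReduceWrapper and the new SerializableBiFunction let the engine contexts accept reduce functions that may throw checked exceptions and still feed them to java.util.stream, rethrowing any failure as a HoodieException. A small sketch with made-up numbers:

    import org.apache.hudi.common.function.FunctionWrapper;
    import org.apache.hudi.common.function.SerializableBiFunction;

    import java.util.Arrays;
    import java.util.function.BinaryOperator;

    class ReduceWrapperSketch {
      static long totalBytes() {
        SerializableBiFunction<Long, Long, Long> sum = (a, b) -> a + b;
        // The wrapper adapts the serializable function into a plain BinaryOperator.
        BinaryOperator<Long> op = FunctionWrapper.throwingReduceWrapper(sum);
        return Arrays.asList(120L, 80L, 310L).stream().reduce(op).orElse(0L);  // 510
      }
    }
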
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.function; + +import org.apache.hudi.common.util.collection.Pair; + +import java.io.Serializable; +import java.util.stream.Stream; + +/** + * A function that returns a stream of key-value pairs (Pair<K, V>). + */ +@FunctionalInterface +public interface SerializablePairFlatMapFunction extends Serializable { + Stream> call(I t) throws Exception; +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java index 86ccf673ee9d1..76474fde66eae 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java @@ -18,7 +18,6 @@ package org.apache.hudi.common.model; -import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.util.Option; import org.apache.avro.Schema; @@ -56,7 +55,7 @@ public Option combineAndGetUpdateValue(IndexedRecord currentValue if (recordBytes.length == 0) { return Option.empty(); } - HoodieConfig hoodieConfig = new HoodieConfig(properties); + GenericRecord incomingRecord = bytesToAvro(recordBytes, schema); // Null check is needed here to support schema evolution. The record in storage may be from old schema where @@ -68,17 +67,27 @@ public Option combineAndGetUpdateValue(IndexedRecord currentValue /* * We reached a point where the value is disk is older than the incoming record. */ - eventTime = Option.ofNullable(getNestedFieldVal(incomingRecord, hoodieConfig - .getString(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY), true)); + eventTime = updateEventTime(incomingRecord, properties); /* * Now check if the incoming record is a delete record. */ - if (isDeleteRecord(incomingRecord)) { + return isDeleteRecord(incomingRecord) ? Option.empty() : Option.of(incomingRecord); + } + + @Override + public Option getInsertValue(Schema schema, Properties properties) throws IOException { + if (recordBytes.length == 0) { return Option.empty(); - } else { - return Option.of(incomingRecord); } + GenericRecord incomingRecord = bytesToAvro(recordBytes, schema); + eventTime = updateEventTime(incomingRecord, properties); + + return isDeleteRecord(incomingRecord) ? Option.empty() : Option.of(incomingRecord); + } + + private static Option updateEventTime(GenericRecord record, Properties properties) { + return Option.ofNullable(getNestedFieldVal(record, properties.getProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY), true)); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java new file mode 100644 index 0000000000000..ca977ae53b5f9 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.parquet.schema.PrimitiveStringifier; + +import java.util.Objects; + +/** + * Hoodie Range metadata. + */ +public class HoodieColumnRangeMetadata { + private final String filePath; + private final String columnName; + private final T minValue; + private final T maxValue; + private final long numNulls; + private final PrimitiveStringifier stringifier; + + public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, final long numNulls, final PrimitiveStringifier stringifier) { + this.filePath = filePath; + this.columnName = columnName; + this.minValue = minValue; + this.maxValue = maxValue; + this.numNulls = numNulls; + this.stringifier = stringifier; + } + + public String getFilePath() { + return this.filePath; + } + + public String getColumnName() { + return this.columnName; + } + + public T getMinValue() { + return this.minValue; + } + + public T getMaxValue() { + return this.maxValue; + } + + public PrimitiveStringifier getStringifier() { + return stringifier; + } + + public long getNumNulls() { + return numNulls; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final HoodieColumnRangeMetadata that = (HoodieColumnRangeMetadata) o; + return Objects.equals(getFilePath(), that.getFilePath()) + && Objects.equals(getColumnName(), that.getColumnName()) + && Objects.equals(getMinValue(), that.getMinValue()) + && Objects.equals(getMaxValue(), that.getMaxValue()) + && Objects.equals(getNumNulls(), that.getNumNulls()); + } + + @Override + public int hashCode() { + return Objects.hash(getColumnName(), getMinValue(), getMaxValue(), getNumNulls()); + } + + @Override + public String toString() { + return "HoodieColumnRangeMetadata{" + + "filePath ='" + filePath + '\'' + + "columnName='" + columnName + '\'' + + ", minValue=" + minValue + + ", maxValue=" + maxValue + + ", numNulls=" + numNulls + '}'; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java index da72b165f2686..c1e8cbf08b11c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java @@ -27,6 +27,7 @@ import com.fasterxml.jackson.annotation.PropertyAccessor; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -104,8 +105,8 @@ public void setCompacted(Boolean 
compacted) { public HashMap getFileIdAndRelativePaths() { HashMap filePaths = new HashMap<>(); // list all partitions paths - for (Map.Entry> entry : getPartitionToWriteStats().entrySet()) { - for (HoodieWriteStat stat : entry.getValue()) { + for (List stats : getPartitionToWriteStats().values()) { + for (HoodieWriteStat stat : stats) { filePaths.put(stat.getFileId(), stat.getPath()); } } @@ -142,6 +143,60 @@ public Map getFileGroupIdAndFullPaths(String basePath return fileGroupIdToFullPaths; } + /** + * Extract the file status of all affected files from the commit metadata. If a file has + * been touched multiple times in the given commits, the return value will keep the one + * from the latest commit. + * + * @param basePath The base path + * @return the file full path to file status mapping + */ + public Map getFullPathToFileStatus(String basePath) { + Map fullPathToFileStatus = new HashMap<>(); + for (List stats : getPartitionToWriteStats().values()) { + // Iterate through all the written files. + for (HoodieWriteStat stat : stats) { + String relativeFilePath = stat.getPath(); + Path fullPath = relativeFilePath != null ? FSUtils.getPartitionPath(basePath, relativeFilePath) : null; + if (fullPath != null) { + FileStatus fileStatus = new FileStatus(stat.getFileSizeInBytes(), false, 0, 0, + 0, fullPath); + fullPathToFileStatus.put(fullPath.getName(), fileStatus); + } + } + } + return fullPathToFileStatus; + } + + /** + * Extract the file status of all affected files from the commit metadata. If a file has + * been touched multiple times in the given commits, the return value will keep the one + * from the latest commit by file group ID. + * + *
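
getFullPathToFileStatus above rebuilds lightweight FileStatus objects purely from the write stats (only the length and path are populated) and keys them by file name. A hedged sketch of how a caller might consume it, assuming the commit metadata is already loaded:

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hudi.common.model.HoodieCommitMetadata;

    import java.util.Map;

    class CommitMetadataSketch {
      static long bytesTouched(HoodieCommitMetadata metadata, String basePath) {
        // File name -> FileStatus carrying just the size reported in the write stats.
        Map<String, FileStatus> statuses = metadata.getFullPathToFileStatus(basePath);
        return statuses.values().stream().mapToLong(FileStatus::getLen).sum();
      }
    }
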

Note: different with {@link #getFullPathToFileStatus(String)}, + * only the latest commit file for a file group is returned, + * this is an optimization for COPY_ON_WRITE table to eliminate legacy files for filesystem view. + * + * @param basePath The base path + * @return the file ID to file status mapping + */ + public Map getFileIdToFileStatus(String basePath) { + Map fileIdToFileStatus = new HashMap<>(); + for (List stats : getPartitionToWriteStats().values()) { + // Iterate through all the written files. + for (HoodieWriteStat stat : stats) { + String relativeFilePath = stat.getPath(); + Path fullPath = relativeFilePath != null ? FSUtils.getPartitionPath(basePath, relativeFilePath) : null; + if (fullPath != null) { + FileStatus fileStatus = new FileStatus(stat.getFileSizeInBytes(), false, 0, 0, + 0, fullPath); + fileIdToFileStatus.put(stat.getFileId(), fileStatus); + } + } + } + return fileIdToFileStatus; + } + public String toJsonString() throws IOException { if (partitionToWriteStats.containsKey(null)) { LOG.info("partition path is null for " + partitionToWriteStats.get(null)); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java index 7ebf9887af690..36dd30b659dbf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java @@ -45,7 +45,7 @@ public interface HoodieRecordPayload extends Seri T preCombine(T oldValue); /** - * When more than one HoodieRecord have the same HoodieKey, this function combines them before attempting to insert/upsert by taking in a property map. + * When more than one HoodieRecord have the same HoodieKey in the incoming batch, this function combines them before attempting to insert/upsert by taking in a property map. * Implementation can leverage the property to decide their business logic to do preCombine. * * @param oldValue instance of the old {@link HoodieRecordPayload} to be combined with. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java index 8c78209788d23..4be2e3e093e90 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.model; +import org.apache.avro.JsonProperties; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.util.Option; @@ -99,6 +100,9 @@ protected boolean isDeleteRecord(GenericRecord genericRecord) { * Return true if value equals defaultValue otherwise false. 
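
The JsonProperties.NULL_VALUE branch added below matters because Avro reports a field declared with "default": null as JsonProperties.NULL_VALUE rather than a Java null, so the plain Objects.equals comparison never matched. A rough sketch of the resulting behaviour, assuming the Option-based constructor and hypothetical values:

    import org.apache.avro.JsonProperties;
    import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
    import org.apache.hudi.common.util.Option;

    class OverwriteFieldSketch {
      static void sketch() {
        OverwriteWithLatestAvroPayload payload = new OverwriteWithLatestAvroPayload(Option.empty());
        payload.overwriteField(null, JsonProperties.NULL_VALUE);   // true: a null value matches an Avro null default
        payload.overwriteField("abc", JsonProperties.NULL_VALUE);  // false: real data is kept
        payload.overwriteField("abc", "abc");                      // true via Objects.equals, unchanged behaviour
      }
    }
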
*/ public Boolean overwriteField(Object value, Object defaultValue) { + if (JsonProperties.NULL_VALUE.equals(defaultValue)) { + return value == null; + } return Objects.equals(value, defaultValue); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index 6d3a7d0015057..dc57fd1c6ff8b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -31,6 +31,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.avro.Schema; import org.apache.hadoop.fs.FSDataInputStream; @@ -41,7 +42,6 @@ import org.apache.log4j.Logger; import java.io.IOException; -import java.io.Serializable; import java.util.Arrays; import java.util.Date; import java.util.Map; @@ -63,7 +63,7 @@ + "initializing a path as hoodie base path and rarely changes during " + "the lifetime of the table. Writers/Queries' configurations are validated against these " + " each time for compatibility.") -public class HoodieTableConfig extends HoodieConfig implements Serializable { +public class HoodieTableConfig extends HoodieConfig { private static final Logger LOG = LogManager.getLogger(HoodieTableConfig.class); @@ -136,10 +136,10 @@ public class HoodieTableConfig extends HoodieConfig implements Serializable { .defaultValue("archived") .withDocumentation("path under the meta folder, to store archived timeline instants at."); - public static final ConfigProperty BOOTSTRAP_INDEX_ENABLE = ConfigProperty + public static final ConfigProperty BOOTSTRAP_INDEX_ENABLE = ConfigProperty .key("hoodie.bootstrap.index.enable") - .noDefaultValue() - .withDocumentation("Whether or not, this is a bootstrapped table, with bootstrap base data and an mapping index defined."); + .defaultValue(true) + .withDocumentation("Whether or not, this is a bootstrapped table, with bootstrap base data and an mapping index defined, default true."); public static final ConfigProperty BOOTSTRAP_INDEX_CLASS_NAME = ConfigProperty .key("hoodie.bootstrap.index.class") @@ -162,6 +162,9 @@ public class HoodieTableConfig extends HoodieConfig implements Serializable { .noDefaultValue() .withDocumentation("Key Generator class property for the hoodie table"); + public static final ConfigProperty URL_ENCODE_PARTITIONING = KeyGeneratorOptions.URL_ENCODE_PARTITIONING; + public static final ConfigProperty HIVE_STYLE_PARTITIONING_ENABLE = KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE; + public static final String NO_OP_BOOTSTRAP_INDEX_CLASS = NoOpBootstrapIndex.class.getName(); public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName) { @@ -298,8 +301,9 @@ public String getBootstrapIndexClass() { } public static String getDefaultBootstrapIndexClass(Properties props) { + HoodieConfig hoodieConfig = new HoodieConfig(props); String defaultClass = BOOTSTRAP_INDEX_CLASS_NAME.defaultValue(); - if ("false".equalsIgnoreCase(props.getProperty(BOOTSTRAP_INDEX_ENABLE.key()))) { + if (!hoodieConfig.getBooleanOrDefault(BOOTSTRAP_INDEX_ENABLE)) { defaultClass = NO_OP_BOOTSTRAP_INDEX_CLASS; } return defaultClass; @@ -363,6 +367,18 @@ public String getRecordKeyFieldProp() { return getString(RECORDKEY_FIELDS); } + public String getKeyGeneratorClassName() { + return 
getString(KEY_GENERATOR_CLASS_NAME); + } + + public String getHiveStylePartitioningEnable() { + return getString(HIVE_STYLE_PARTITIONING_ENABLE); + } + + public String getUrlEncodePartitoning() { + return getString(URL_ENCODE_PARTITIONING); + } + public Map propsMap() { return props.entrySet().stream() .collect(Collectors.toMap(e -> String.valueOf(e.getKey()), e -> String.valueOf(e.getValue()))); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index 97464b1b28d1c..450a3cc2eb3a8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -49,7 +49,6 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.File; import java.io.IOException; import java.io.Serializable; import java.util.Arrays; @@ -76,10 +75,11 @@ public class HoodieTableMetaClient implements Serializable { private static final long serialVersionUID = 1L; private static final Logger LOG = LogManager.getLogger(HoodieTableMetaClient.class); public static final String METAFOLDER_NAME = ".hoodie"; - public static final String TEMPFOLDER_NAME = METAFOLDER_NAME + File.separator + ".temp"; - public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + File.separator + ".aux"; - public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + File.separator + ".bootstrap"; - public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + File.separator + ".heartbeat"; + public static final String TEMPFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".temp"; + public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux"; + public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap"; + public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat"; + public static final String ZINDEX_NAME = ".zindex"; public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR + ".partitions"; public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR @@ -177,6 +177,13 @@ public String getMetaPath() { return metaPath; } + /** + * @return z-index path + */ + public String getZindexPath() { + return new Path(metaPath, ZINDEX_NAME).toString(); + } + /** * @return Temp Folder path */ @@ -205,7 +212,7 @@ public String getMetaAuxiliaryPath() { * @return Heartbeat folder path. 
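
The switch from File.separator to Path.SEPARATOR in the meta-folder constants above keeps those paths valid on every platform: File.separator is "\" on Windows, while Hadoop paths always use "/". A tiny sketch of the difference:

    import org.apache.hadoop.fs.Path;

    import java.io.File;

    class SeparatorSketch {
      static void sketch() {
        String platformDependent = ".hoodie" + File.separator + ".temp";  // ".hoodie\.temp" on Windows
        String alwaysHadoopStyle = ".hoodie" + Path.SEPARATOR + ".temp";  // ".hoodie/.temp" everywhere
      }
    }
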
*/ public static String getHeartbeatFolderPath(String basePath) { - return String.format("%s%s%s", basePath, File.separator, HEARTBEAT_FOLDER_NAME); + return String.format("%s%s%s", basePath, Path.SEPARATOR, HEARTBEAT_FOLDER_NAME); } /** @@ -227,7 +234,7 @@ public String getBootstrapIndexByFileIdFolderNameFolderPath() { */ public String getArchivePath() { String archiveFolder = tableConfig.getArchivelogFolder(); - return getMetaPath() + "/" + archiveFolder; + return getMetaPath() + Path.SEPARATOR + archiveFolder; } /** @@ -627,8 +634,11 @@ public static class PropertyBuilder { private String partitionFields; private String bootstrapIndexClass; private String bootstrapBasePath; + private Boolean bootstrapIndexEnable; private Boolean populateMetaFields; private String keyGeneratorClassProp; + private Boolean hiveStylePartitioningEnable; + private Boolean urlEncodePartitioning; private PropertyBuilder() { @@ -702,6 +712,11 @@ public PropertyBuilder setBootstrapBasePath(String bootstrapBasePath) { return this; } + public PropertyBuilder setBootstrapIndexEnable(Boolean bootstrapIndexEnable) { + this.bootstrapIndexEnable = bootstrapIndexEnable; + return this; + } + public PropertyBuilder setPopulateMetaFields(boolean populateMetaFields) { this.populateMetaFields = populateMetaFields; return this; @@ -712,6 +727,16 @@ public PropertyBuilder setKeyGeneratorClassProp(String keyGeneratorClassProp) { return this; } + public PropertyBuilder setHiveStylePartitioningEnable(Boolean hiveStylePartitioningEnable) { + this.hiveStylePartitioningEnable = hiveStylePartitioningEnable; + return this; + } + + public PropertyBuilder setUrlEncodePartitioning(Boolean urlEncodePartitioning) { + this.urlEncodePartitioning = urlEncodePartitioning; + return this; + } + public PropertyBuilder fromMetaClient(HoodieTableMetaClient metaClient) { return setTableType(metaClient.getTableType()) .setTableName(metaClient.getTableConfig().getTableName()) @@ -749,6 +774,11 @@ public PropertyBuilder fromProperties(Properties properties) { if (hoodieConfig.contains(HoodieTableConfig.BOOTSTRAP_BASE_PATH)) { setBootstrapBasePath(hoodieConfig.getString(HoodieTableConfig.BOOTSTRAP_BASE_PATH)); } + + if (hoodieConfig.contains(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE)) { + setBootstrapIndexEnable(hoodieConfig.getBoolean(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE)); + } + if (hoodieConfig.contains(HoodieTableConfig.PRECOMBINE_FIELD)) { setPreCombineField(hoodieConfig.getString(HoodieTableConfig.PRECOMBINE_FIELD)); } @@ -768,6 +798,12 @@ public PropertyBuilder fromProperties(Properties properties) { if (hoodieConfig.contains(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME)) { setKeyGeneratorClassProp(hoodieConfig.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME)); } + if (hoodieConfig.contains(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)) { + setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)); + } + if (hoodieConfig.contains(HoodieTableConfig.URL_ENCODE_PARTITIONING)) { + setUrlEncodePartitioning(hoodieConfig.getBoolean(HoodieTableConfig.URL_ENCODE_PARTITIONING)); + } return this; } @@ -807,6 +843,10 @@ public Properties build() { tableConfig.setValue(HoodieTableConfig.BOOTSTRAP_INDEX_CLASS_NAME, bootstrapIndexClass); } + if (null != bootstrapIndexEnable) { + tableConfig.setValue(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE, Boolean.toString(bootstrapIndexEnable)); + } + if (null != bootstrapBasePath) { tableConfig.setValue(HoodieTableConfig.BOOTSTRAP_BASE_PATH, bootstrapBasePath); } @@ 
-827,6 +867,12 @@ public Properties build() { if (null != keyGeneratorClassProp) { tableConfig.setValue(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME, keyGeneratorClassProp); } + if (null != hiveStylePartitioningEnable) { + tableConfig.setValue(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE, Boolean.toString(hiveStylePartitioningEnable)); + } + if (null != urlEncodePartitioning) { + tableConfig.setValue(HoodieTableConfig.URL_ENCODE_PARTITIONING, Boolean.toString(urlEncodePartitioning)); + } return tableConfig.getProps(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java index 6bbc02d82a4b6..122c387756e88 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java @@ -32,7 +32,9 @@ public enum HoodieTableVersion { // 0.6.0 onwards ONE(1), // 0.9.0 onwards - TWO(2); + TWO(2), + // 0.10.0 onwards + THREE(3); private final int versionCode; @@ -45,7 +47,7 @@ public int versionCode() { } public static HoodieTableVersion current() { - return TWO; + return THREE; } public static HoodieTableVersion versionFromCode(int versionCode) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java index 70b820c868999..51e3e273806c3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java @@ -149,7 +149,7 @@ private MessageType getTableParquetSchemaFromDataFile() throws Exception { } } - private Schema getTableAvroSchemaFromDataFile() throws Exception { + public Schema getTableAvroSchemaFromDataFile() throws Exception { return convertParquetSchemaToAvro(getTableParquetSchemaFromDataFile()); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java similarity index 83% rename from hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java rename to hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java index 868c7cb895c76..e2e76ad7d6503 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java @@ -47,6 +47,7 @@ import java.io.IOException; import java.util.ArrayDeque; +import java.util.ArrayList; import java.util.Arrays; import java.util.Deque; import java.util.HashSet; @@ -71,9 +72,9 @@ *
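
The PropertyBuilder additions in the HoodieTableMetaClient diff above carry the bootstrap-index, hive-style-partitioning and URL-encoding flags into the table properties. A hedged sketch, assuming the static withPropertyBuilder() factory that callers normally use:

    import org.apache.hudi.common.model.HoodieTableType;
    import org.apache.hudi.common.table.HoodieTableMetaClient;

    import java.util.Properties;

    class TablePropertiesSketch {
      static Properties sketch() {
        return HoodieTableMetaClient.withPropertyBuilder()
            .setTableType(HoodieTableType.COPY_ON_WRITE)
            .setTableName("trips")
            .setBootstrapIndexEnable(false)
            .setHiveStylePartitioningEnable(true)
            .setUrlEncodePartitioning(false)
            .build();
      }
    }
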

* This results in two I/O passes over the log file. */ -public abstract class AbstractHoodieLogRecordScanner { +public abstract class AbstractHoodieLogRecordReader { - private static final Logger LOG = LogManager.getLogger(AbstractHoodieLogRecordScanner.class); + private static final Logger LOG = LogManager.getLogger(AbstractHoodieLogRecordReader.class); // Reader schema for the records protected final Schema readerSchema; @@ -114,12 +115,23 @@ public abstract class AbstractHoodieLogRecordScanner { private AtomicLong totalCorruptBlocks = new AtomicLong(0); // Store the last instant log blocks (needed to implement rollback) private Deque currentInstantLogBlocks = new ArrayDeque<>(); + // Enables full scan of log records + protected final boolean enableFullScan; + private int totalScannedLogFiles; // Progress private float progress = 0.0f; - protected AbstractHoodieLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, - String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, - int bufferSize, Option instantRange, boolean withOperationField) { + protected AbstractHoodieLogRecordReader(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, + String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, + int bufferSize, Option instantRange, boolean withOperationField) { + this(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, instantRange, withOperationField, + true); + } + + protected AbstractHoodieLogRecordReader(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, + String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, + int bufferSize, Option instantRange, boolean withOperationField, + boolean enableFullScan) { this.readerSchema = readerSchema; this.latestInstantTime = latestInstantTime; this.hoodieTableMetaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).build(); @@ -132,18 +144,27 @@ protected AbstractHoodieLogRecordScanner(FileSystem fs, String basePath, List> keys) { + currentInstantLogBlocks = new ArrayDeque<>(); + progress = 0.0f; + totalLogFiles = new AtomicLong(0); + totalRollbacks = new AtomicLong(0); + totalCorruptBlocks = new AtomicLong(0); + totalLogBlocks = new AtomicLong(0); + totalLogRecords = new AtomicLong(0); HoodieLogFormatReader logFormatReaderWrapper = null; HoodieTimeline commitsTimeline = this.hoodieTableMetaClient.getCommitsTimeline(); HoodieTimeline completedInstantsTimeline = commitsTimeline.filterCompletedInstants(); @@ -152,7 +173,7 @@ public void scan() { // iterate over the paths logFormatReaderWrapper = new HoodieLogFormatReader(fs, logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()), - readerSchema, readBlocksLazily, reverseReader, bufferSize); + readerSchema, readBlocksLazily, reverseReader, bufferSize, !enableFullScan); Set scannedLogFiles = new HashSet<>(); while (logFormatReaderWrapper.hasNext()) { HoodieLogFile logFile = logFormatReaderWrapper.getLogFile(); @@ -160,16 +181,16 @@ public void scan() { scannedLogFiles.add(logFile); totalLogFiles.set(scannedLogFiles.size()); // Use the HoodieLogFileReader to iterate through the blocks in the log file - HoodieLogBlock r = logFormatReaderWrapper.next(); - final String instantTime = r.getLogBlockHeader().get(INSTANT_TIME); + HoodieLogBlock logBlock = logFormatReaderWrapper.next(); + final String instantTime = 
logBlock.getLogBlockHeader().get(INSTANT_TIME); totalLogBlocks.incrementAndGet(); - if (r.getBlockType() != CORRUPT_BLOCK - && !HoodieTimeline.compareTimestamps(r.getLogBlockHeader().get(INSTANT_TIME), HoodieTimeline.LESSER_THAN_OR_EQUALS, this.latestInstantTime + if (logBlock.getBlockType() != CORRUPT_BLOCK + && !HoodieTimeline.compareTimestamps(logBlock.getLogBlockHeader().get(INSTANT_TIME), HoodieTimeline.LESSER_THAN_OR_EQUALS, this.latestInstantTime )) { // hit a block with instant time greater than should be processed, stop processing further break; } - if (r.getBlockType() != CORRUPT_BLOCK && r.getBlockType() != COMMAND_BLOCK) { + if (logBlock.getBlockType() != CORRUPT_BLOCK && logBlock.getBlockType() != COMMAND_BLOCK) { if (!completedInstantsTimeline.containsOrBeforeTimelineStarts(instantTime) || inflightInstantsTimeline.containsInstant(instantTime)) { // hit an uncommitted block possibly from a failed write, move to the next one and skip processing this one @@ -180,28 +201,28 @@ public void scan() { continue; } } - switch (r.getBlockType()) { + switch (logBlock.getBlockType()) { case HFILE_DATA_BLOCK: case AVRO_DATA_BLOCK: LOG.info("Reading a data block from file " + logFile.getPath() + " at instant " - + r.getLogBlockHeader().get(INSTANT_TIME)); - if (isNewInstantBlock(r) && !readBlocksLazily) { + + logBlock.getLogBlockHeader().get(INSTANT_TIME)); + if (isNewInstantBlock(logBlock) && !readBlocksLazily) { // If this is an avro data block belonging to a different commit/instant, // then merge the last blocks and records into the main result - processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size()); + processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys); } // store the current block - currentInstantLogBlocks.push(r); + currentInstantLogBlocks.push(logBlock); break; case DELETE_BLOCK: LOG.info("Reading a delete block from file " + logFile.getPath()); - if (isNewInstantBlock(r) && !readBlocksLazily) { + if (isNewInstantBlock(logBlock) && !readBlocksLazily) { // If this is a delete data block belonging to a different commit/instant, // then merge the last blocks and records into the main result - processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size()); + processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys); } // store deletes so can be rolled back - currentInstantLogBlocks.push(r); + currentInstantLogBlocks.push(logBlock); break; case COMMAND_BLOCK: // Consider the following scenario @@ -218,9 +239,9 @@ public void scan() { // both B1 & B2 LOG.info("Reading a command block from file " + logFile.getPath()); // This is a command block - take appropriate action based on the command - HoodieCommandBlock commandBlock = (HoodieCommandBlock) r; + HoodieCommandBlock commandBlock = (HoodieCommandBlock) logBlock; String targetInstantForCommandBlock = - r.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME); + logBlock.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME); switch (commandBlock.getType()) { // there can be different types of command blocks case ROLLBACK_PREVIOUS_BLOCK: // Rollback the last read log block @@ -264,7 +285,7 @@ public void scan() { LOG.info("Found a corrupt block in " + logFile.getPath()); totalCorruptBlocks.incrementAndGet(); // If there is a corrupt block - we will assume that this was the next data block - currentInstantLogBlocks.push(r); + currentInstantLogBlocks.push(logBlock); break; default: throw new 
UnsupportedOperationException("Block type not supported yet"); @@ -273,7 +294,7 @@ public void scan() { // merge the last read block when all the blocks are done reading if (!currentInstantLogBlocks.isEmpty()) { LOG.info("Merging the final data blocks"); - processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size()); + processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys); } // Done progress = 1.0f; @@ -308,9 +329,14 @@ private boolean isNewInstantBlock(HoodieLogBlock logBlock) { * Iterate over the GenericRecord in the block, read the hoodie key and partition path and call subclass processors to * handle it. */ - private void processDataBlock(HoodieDataBlock dataBlock) throws Exception { + private void processDataBlock(HoodieDataBlock dataBlock, Option> keys) throws Exception { // TODO (NA) - Implement getRecordItr() in HoodieAvroDataBlock and use that here - List recs = dataBlock.getRecords(); + List recs = new ArrayList<>(); + if (!keys.isPresent()) { + recs = dataBlock.getRecords(); + } else { + recs = dataBlock.getRecords(keys.get()); + } totalLogRecords.addAndGet(recs.size()); for (IndexedRecord rec : recs) { processNextRecord(createHoodieRecord(rec)); @@ -342,17 +368,18 @@ protected HoodieRecord createHoodieRecord(IndexedRecord rec) { /** * Process the set of log blocks belonging to the last instant which is read fully. */ - private void processQueuedBlocksForInstant(Deque lastBlocks, int numLogFilesSeen) throws Exception { - while (!lastBlocks.isEmpty()) { - LOG.info("Number of remaining logblocks to merge " + lastBlocks.size()); + private void processQueuedBlocksForInstant(Deque logBlocks, int numLogFilesSeen, + Option> keys) throws Exception { + while (!logBlocks.isEmpty()) { + LOG.info("Number of remaining logblocks to merge " + logBlocks.size()); // poll the element at the bottom of the stack since that's the order it was inserted - HoodieLogBlock lastBlock = lastBlocks.pollLast(); + HoodieLogBlock lastBlock = logBlocks.pollLast(); switch (lastBlock.getBlockType()) { case AVRO_DATA_BLOCK: - processDataBlock((HoodieAvroDataBlock) lastBlock); + processDataBlock((HoodieAvroDataBlock) lastBlock, keys); break; case HFILE_DATA_BLOCK: - processDataBlock((HoodieHFileDataBlock) lastBlock); + processDataBlock((HoodieHFileDataBlock) lastBlock, keys); break; case DELETE_BLOCK: Arrays.stream(((HoodieDeleteBlock) lastBlock).getKeysToDelete()).forEach(this::processNextDeletedKey); @@ -432,6 +459,6 @@ public Builder withOperationField(boolean withOperationField) { throw new UnsupportedOperationException(); } - public abstract AbstractHoodieLogRecordScanner build(); + public abstract AbstractHoodieLogRecordReader build(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java index f0f3842e97b36..cdf3065587d13 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java @@ -70,17 +70,24 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader { private long reverseLogFilePosition; private long lastReverseLogFilePosition; private boolean reverseReader; + private boolean enableInlineReading; private boolean closed = false; private transient Thread shutdownThread = null; public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, boolean 
readBlockLazily, boolean reverseReader) throws IOException { + this(fs, logFile, readerSchema, bufferSize, readBlockLazily, reverseReader, false); + } + + public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, + boolean readBlockLazily, boolean reverseReader, boolean enableInlineReading) throws IOException { FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize); this.logFile = logFile; this.inputStream = getFSDataInputStream(fsDataInputStream, fs, bufferSize); this.readerSchema = readerSchema; this.readBlockLazily = readBlockLazily; this.reverseReader = reverseReader; + this.enableInlineReading = enableInlineReading; if (this.reverseReader) { this.reverseLogFilePosition = this.lastReverseLogFilePosition = fs.getFileStatus(logFile.getPath()).getLen(); } @@ -248,7 +255,7 @@ private HoodieLogBlock readBlock() throws IOException { } case HFILE_DATA_BLOCK: return new HoodieHFileDataBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, readerSchema, header, footer); + contentPosition, contentLength, blockEndPos, readerSchema, header, footer, enableInlineReading); case DELETE_BLOCK: return HoodieDeleteBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, contentPosition, contentLength, blockEndPos, header, footer); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java index 9b643ec6e16c8..c566788fd1667 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java @@ -56,6 +56,8 @@ public interface HoodieLogFormat { String UNKNOWN_WRITE_TOKEN = "1-0-1"; + String DEFAULT_WRITE_TOKEN = "0-0-0"; + /** * Writer interface to allow appending block to this file format. 
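
Note on the hunk above: the new HoodieLogFileReader overload threads an enableInlineReading flag down to HFile data blocks so they can be point-read through the InLineFileSystem instead of being fully deserialized. A minimal sketch of opening a reader with inline reading turned on, assuming fs, logFile and readerSchema already exist in the caller's context (the constructor may throw IOException):

    HoodieLogFileReader logReader = new HoodieLogFileReader(
        fs, logFile, readerSchema,
        4 * 1024 * 1024,  // bufferSize
        true,             // readBlockLazily
        false,            // reverseReader
        true);            // enableInlineReading: HFile blocks resolve requested keys via InLineFileSystem
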
*/ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java index 72672278b6b65..36fa187aa4111 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java @@ -49,7 +49,12 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { private static final Logger LOG = LogManager.getLogger(HoodieLogFormatReader.class); HoodieLogFormatReader(FileSystem fs, List logFiles, Schema readerSchema, boolean readBlocksLazily, - boolean reverseLogReader, int bufferSize) throws IOException { + boolean reverseLogReader, int bufferSize) throws IOException { + this(fs, logFiles, readerSchema, readBlocksLazily, reverseLogReader, bufferSize, false); + } + + HoodieLogFormatReader(FileSystem fs, List logFiles, Schema readerSchema, boolean readBlocksLazily, + boolean reverseLogReader, int bufferSize, boolean enableInlineReading) throws IOException { this.logFiles = logFiles; this.fs = fs; this.readerSchema = readerSchema; @@ -59,7 +64,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { this.prevReadersInOpenState = new ArrayList<>(); if (logFiles.size() > 0) { HoodieLogFile nextLogFile = logFiles.remove(0); - this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false); + this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false, enableInlineReading); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java index a68c8f17b1a28..a8d97ac1b5f18 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java @@ -54,7 +54,7 @@ * This results in two I/O passes over the log file. */ -public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordScanner +public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader implements Iterable> { private static final Logger LOG = LogManager.getLogger(HoodieMergedLogRecordScanner.class); @@ -77,8 +77,9 @@ protected HoodieMergedLogRecordScanner(FileSystem fs, String basePath, List instantRange, boolean autoScan, ExternalSpillableMap.DiskMapType diskMapType, boolean isBitCaskDiskMapCompressionEnabled, - boolean withOperationField) { - super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, instantRange, withOperationField); + boolean withOperationField, boolean enableFullScan) { + super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, instantRange, withOperationField, + enableFullScan); try { // Store merged records for all versions for this log file, set the in-memory footprint to maxInMemoryMapSize this.records = new ExternalSpillableMap<>(maxMemorySizeInBytes, spillableMapBasePath, new DefaultSizeEstimator(), @@ -166,7 +167,7 @@ public void close() { /** * Builder used to build {@code HoodieUnMergedLogRecordScanner}. 
*/ - public static class Builder extends AbstractHoodieLogRecordScanner.Builder { + public static class Builder extends AbstractHoodieLogRecordReader.Builder { protected FileSystem fs; protected String basePath; protected List logFilePaths; @@ -181,52 +182,61 @@ public static class Builder extends AbstractHoodieLogRecordScanner.Builder { protected ExternalSpillableMap.DiskMapType diskMapType = HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue(); protected boolean isBitCaskDiskMapCompressionEnabled = HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue(); // incremental filtering - private Option instantRange = Option.empty(); + protected Option instantRange = Option.empty(); // auto scan default true private boolean autoScan = true; // operation field default false private boolean withOperationField = false; + @Override public Builder withFileSystem(FileSystem fs) { this.fs = fs; return this; } + @Override public Builder withBasePath(String basePath) { this.basePath = basePath; return this; } + @Override public Builder withLogFilePaths(List logFilePaths) { this.logFilePaths = logFilePaths; return this; } + @Override public Builder withReaderSchema(Schema schema) { this.readerSchema = schema; return this; } + @Override public Builder withLatestInstantTime(String latestInstantTime) { this.latestInstantTime = latestInstantTime; return this; } + @Override public Builder withReadBlocksLazily(boolean readBlocksLazily) { this.readBlocksLazily = readBlocksLazily; return this; } + @Override public Builder withReverseReader(boolean reverseReader) { this.reverseReader = reverseReader; return this; } + @Override public Builder withBufferSize(int bufferSize) { this.bufferSize = bufferSize; return this; } + @Override public Builder withInstantRange(Option instantRange) { this.instantRange = instantRange; return this; @@ -267,7 +277,7 @@ public HoodieMergedLogRecordScanner build() { return new HoodieMergedLogRecordScanner(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, readBlocksLazily, reverseReader, bufferSize, spillableMapBasePath, instantRange, autoScan, - diskMapType, isBitCaskDiskMapCompressionEnabled, withOperationField); + diskMapType, isBitCaskDiskMapCompressionEnabled, withOperationField, true); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index 8b26f72579c80..f781a148a3938 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -31,7 +31,7 @@ /** * A scanner used to scan hoodie unmerged log records. */ -public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordScanner { +public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReader { private final LogRecordScannerCallback callback; @@ -72,7 +72,7 @@ public static interface LogRecordScannerCallback { /** * Builder used to build {@code HoodieUnMergedLogRecordScanner}. 
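
For context on how these builder methods are usually chained, here is a hedged usage sketch. The newBuilder() factory and the memory/spill-path setters are not part of this hunk and are assumed from the surrounding class; build() wires enableFullScan to true, as in the build() change above:

    HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() // assumed factory method
        .withFileSystem(fs)
        .withBasePath(basePath)
        .withLogFilePaths(logFilePaths)
        .withReaderSchema(readerSchema)
        .withLatestInstantTime(latestInstantTime)
        .withReadBlocksLazily(true)
        .withReverseReader(false)
        .withBufferSize(4 * 1024 * 1024)
        .withInstantRange(Option.empty())
        .withMaxMemorySizeInBytes(1024 * 1024 * 1024L) // assumed existing setter, not in this hunk
        .withSpillableMapBasePath("/tmp/hudi-spill")   // assumed existing setter, not in this hunk
        .build();
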
*/ - public static class Builder extends AbstractHoodieLogRecordScanner.Builder { + public static class Builder extends AbstractHoodieLogRecordReader.Builder { private FileSystem fs; private String basePath; private List logFilePaths; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java index 8f5b741f37909..2e4338ef785d0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java @@ -111,6 +111,17 @@ public List getRecords() { return records; } + /** + * Batch get of keys of interest. Implementation can choose to either do full scan and return matched entries or + * do a seek based parsing and return matched entries. + * @param keys keys of interest. + * @return List of IndexedRecords for the keys of interest. + * @throws IOException + */ + public List getRecords(List keys) throws IOException { + throw new UnsupportedOperationException("On demand batch get based on interested keys not supported"); + } + public Schema getSchema() { // if getSchema was invoked before converting byte [] to records if (records == null) { @@ -119,7 +130,7 @@ public Schema getSchema() { return schema; } - private void createRecordsFromContentBytes() throws IOException { + protected void createRecordsFromContentBytes() throws IOException { if (readBlockLazily && !getContent().isPresent()) { // read log block contents from disk inflate(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java index 6d2682a4ffa09..a1e0c129803f7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java @@ -19,12 +19,16 @@ package org.apache.hudi.common.table.log.block; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.fs.inline.InLineFSUtils; +import org.apache.hudi.common.fs.inline.InLineFileSystem; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.storage.HoodieHFileReader; + +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -44,6 +48,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -61,6 +66,7 @@ public class HoodieHFileDataBlock extends HoodieDataBlock { private static final Logger LOG = LogManager.getLogger(HoodieHFileDataBlock.class); private static Compression.Algorithm compressionAlgorithm = Compression.Algorithm.GZ; private static int blockSize = 1 * 1024 * 1024; + private boolean enableInlineReading = false; public HoodieHFileDataBlock(@Nonnull Map logBlockHeader, @Nonnull Map logBlockFooter, @@ -71,10 +77,11 @@ public HoodieHFileDataBlock(@Nonnull Map logBlockHea public HoodieHFileDataBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, boolean readBlockLazily, long position, long blockSize, long blockEndpos, Schema readerSchema, 
- Map header, Map footer) { + Map header, Map footer, boolean enableInlineReading) { super(content, inputStream, readBlockLazily, Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), readerSchema, header, footer); + this.enableInlineReading = enableInlineReading; } public HoodieHFileDataBlock(@Nonnull List records, @Nonnull Map header) { @@ -141,6 +148,50 @@ protected byte[] serializeRecords() throws IOException { return baos.toByteArray(); } + @Override + protected void createRecordsFromContentBytes() throws IOException { + if (enableInlineReading) { + getRecords(Collections.emptyList()); + } else { + super.createRecordsFromContentBytes(); + } + } + + @Override + public List getRecords(List keys) throws IOException { + readWithInlineFS(keys); + return records; + } + + private void readWithInlineFS(List keys) throws IOException { + boolean enableFullScan = keys.isEmpty(); + // Get schema from the header + Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + // If readerSchema was not present, use writerSchema + if (schema == null) { + schema = writerSchema; + } + Configuration conf = new Configuration(); + CacheConfig cacheConf = new CacheConfig(conf); + Configuration inlineConf = new Configuration(); + inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName()); + + Path inlinePath = InLineFSUtils.getInlineFilePath( + getBlockContentLocation().get().getLogFile().getPath(), + getBlockContentLocation().get().getLogFile().getPath().getFileSystem(conf).getScheme(), + getBlockContentLocation().get().getContentPositionInLogFile(), + getBlockContentLocation().get().getBlockSize()); + if (!enableFullScan) { + // HFile read will be efficient if keys are sorted, since on storage, records are sorted by key. This will avoid unnecessary seeks. + Collections.sort(keys); + } + HoodieHFileReader reader = new HoodieHFileReader(inlineConf, inlinePath, cacheConf, inlinePath.getFileSystem(inlineConf)); + List> logRecords = enableFullScan ? 
reader.readAllRecords(writerSchema, schema) : + reader.readRecords(keys, schema); + reader.close(); + this.records = logRecords.stream().map(t -> t.getSecond()).collect(Collectors.toList()); + } + @Override protected void deserializeRecords() throws IOException { // Get schema from the header diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java index e6abed677af58..37631b0253c0c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java @@ -27,6 +27,7 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -34,14 +35,14 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.Serializable; -import java.text.SimpleDateFormat; +import java.text.ParseException; +import java.time.Instant; import java.util.Arrays; import java.util.Collections; import java.util.Date; import java.util.HashSet; import java.util.Objects; import java.util.Set; -import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; /** @@ -58,8 +59,6 @@ */ public class HoodieActiveTimeline extends HoodieDefaultTimeline { - public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss"); - public static final Set VALID_EXTENSIONS_IN_ACTIVE_TIMELINE = new HashSet<>(Arrays.asList( COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, REQUESTED_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION, INFLIGHT_DELTA_COMMIT_EXTENSION, REQUESTED_DELTA_COMMIT_EXTENSION, @@ -67,32 +66,48 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { CLEAN_EXTENSION, REQUESTED_CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, INFLIGHT_COMPACTION_EXTENSION, REQUESTED_COMPACTION_EXTENSION, INFLIGHT_RESTORE_EXTENSION, RESTORE_EXTENSION, - ROLLBACK_EXTENSION, INFLIGHT_ROLLBACK_EXTENSION, + ROLLBACK_EXTENSION, REQUESTED_ROLLBACK_EXTENSION, INFLIGHT_ROLLBACK_EXTENSION, REQUESTED_REPLACE_COMMIT_EXTENSION, INFLIGHT_REPLACE_COMMIT_EXTENSION, REPLACE_COMMIT_EXTENSION)); private static final Logger LOG = LogManager.getLogger(HoodieActiveTimeline.class); protected HoodieTableMetaClient metaClient; - private static AtomicReference lastInstantTime = new AtomicReference<>(String.valueOf(Integer.MIN_VALUE)); /** - * Returns next instant time in the {@link #COMMIT_FORMATTER} format. + * Parse the timestamp of an Instant and return a {@code SimpleDateFormat}. + */ + public static Date parseInstantTime(String timestamp) throws ParseException { + return HoodieInstantTimeGenerator.parseInstantTime(timestamp); + } + + /** + * Format the java.time.Instant to a String representing the timestamp of a Hoodie Instant. + */ + public static String formatInstantTime(Instant timestamp) { + return HoodieInstantTimeGenerator.formatInstantTime(timestamp); + } + + /** + * Format the Date to a String representing the timestamp of a Hoodie Instant. + */ + public static String formatInstantTime(Date timestamp) { + return HoodieInstantTimeGenerator.formatInstantTime(timestamp); + } + + /** + * Returns next instant time in the correct format. 
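
The timestamp helpers on HoodieActiveTimeline now delegate to HoodieInstantTimeGenerator. A small illustrative round trip, assuming the yyyyMMddHHmmss instant format described below (parseInstantTime declares ParseException, so the caller must handle it):

    String instantTime = HoodieActiveTimeline.createNewInstantTime();   // e.g. "20211005123045"
    Date asDate = HoodieActiveTimeline.parseInstantTime(instantTime);   // throws ParseException on malformed input
    String formatted = HoodieActiveTimeline.formatInstantTime(asDate);  // back to the same string representation
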
* Ensures each instant time is atleast 1 second apart since we create instant times at second granularity */ public static String createNewInstantTime() { - return createNewInstantTime(0); + return HoodieInstantTimeGenerator.createNewInstantTime(0); } /** - * Returns next instant time that adds N milliseconds in the {@link #COMMIT_FORMATTER} format. + * Returns next instant time that adds N milliseconds to current time. * Ensures each instant time is atleast 1 second apart since we create instant times at second granularity + * + * @param milliseconds Milliseconds to add to current time while generating the new instant time */ public static String createNewInstantTime(long milliseconds) { - return lastInstantTime.updateAndGet((oldVal) -> { - String newCommitTime; - do { - newCommitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date(System.currentTimeMillis() + milliseconds)); - } while (HoodieTimeline.compareTimestamps(newCommitTime, LESSER_THAN_OR_EQUALS, oldVal)); - return newCommitTime; - }); + return HoodieInstantTimeGenerator.createNewInstantTime(milliseconds); } protected HoodieActiveTimeline(HoodieTableMetaClient metaClient, Set includedExtensions) { @@ -128,6 +143,7 @@ public HoodieActiveTimeline(HoodieTableMetaClient metaClient, boolean applyLayou * * @deprecated */ + @Deprecated public HoodieActiveTimeline() { } @@ -136,6 +152,7 @@ public HoodieActiveTimeline() { * * @deprecated */ + @Deprecated private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); } @@ -172,6 +189,14 @@ public void deletePending(HoodieInstant instant) { deleteInstantFile(instant); } + public static void deleteInstantFile(FileSystem fs, String metaPath, HoodieInstant instant) { + try { + fs.delete(new Path(metaPath, instant.getFileName()), false); + } catch (IOException e) { + throw new HoodieIOException("Could not delete instant file" + instant.getFileName(), e); + } + } + public void deletePendingIfExists(HoodieInstant.State state, String action, String instantStr) { HoodieInstant instant = new HoodieInstant(state, action, instantStr); ValidationUtils.checkArgument(!instant.isCompleted()); @@ -229,6 +254,11 @@ public Option readCleanerInfoAsBytes(HoodieInstant instant) { return readDataFromPath(new Path(metaClient.getMetaPath(), instant.getFileName())); } + public Option readRollbackInfoAsBytes(HoodieInstant instant) { + // Rollback metadata are always stored only in timeline .hoodie + return readDataFromPath(new Path(metaClient.getMetaPath(), instant.getFileName())); + } + //----------------------------------------------------------------- // BEGIN - COMPACTION RELATED META-DATA MANAGEMENT. //----------------------------------------------------------------- @@ -339,6 +369,37 @@ public HoodieInstant transitionCleanRequestedToInflight(HoodieInstant requestedI return inflight; } + /** + * Transition Rollback State from inflight to Committed. 
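
Together with the requested-rollback extension and file name added later in this change, the transition methods below give rollbacks the same requested -> inflight -> completed lifecycle as other timeline actions. A hedged sketch of that flow, assuming an active timeline instance and already-built rollback plan/metadata payloads; serializeRollbackMetadata is assumed from the existing TimelineMetadataUtils API and the serializers may throw IOException:

    HoodieInstant requested = new HoodieInstant(HoodieInstant.State.REQUESTED,
        HoodieTimeline.ROLLBACK_ACTION, rollbackInstantTime);
    activeTimeline.saveToRollbackRequested(requested,
        TimelineMetadataUtils.serializeRollbackPlan(rollbackPlan));          // plan stored as <ts>.rollback.requested
    HoodieInstant inflight = activeTimeline.transitionRollbackRequestedToInflight(requested, Option.empty());
    activeTimeline.transitionRollbackInflightToComplete(inflight,
        TimelineMetadataUtils.serializeRollbackMetadata(rollbackMetadata));  // assumed existing serializer
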
+ * + * @param inflightInstant Inflight instant + * @param data Extra Metadata + * @return commit instant + */ + public HoodieInstant transitionRollbackInflightToComplete(HoodieInstant inflightInstant, Option data) { + ValidationUtils.checkArgument(inflightInstant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)); + ValidationUtils.checkArgument(inflightInstant.isInflight()); + HoodieInstant commitInstant = new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, inflightInstant.getTimestamp()); + // Then write to timeline + transitionState(inflightInstant, commitInstant, data); + return commitInstant; + } + + /** + * Transition Rollback State from requested to inflight. + * + * @param requestedInstant requested instant + * @param data Optional data to be stored + * @return commit instant + */ + public HoodieInstant transitionRollbackRequestedToInflight(HoodieInstant requestedInstant, Option data) { + ValidationUtils.checkArgument(requestedInstant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)); + ValidationUtils.checkArgument(requestedInstant.isRequested()); + HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, ROLLBACK_ACTION, requestedInstant.getTimestamp()); + transitionState(requestedInstant, inflight, data); + return inflight; + } + /** * Transition replace requested file to replace inflight. * @@ -460,6 +521,11 @@ private void revertCompleteToInflight(HoodieInstant completed, HoodieInstant inf } } + public void transitionRequestedToInflight(String commitType, String inFlightInstant) { + HoodieInstant requested = new HoodieInstant(HoodieInstant.State.REQUESTED, commitType, inFlightInstant); + transitionRequestedToInflight(requested, Option.empty(), false); + } + public void transitionRequestedToInflight(HoodieInstant requested, Option content) { transitionRequestedToInflight(requested, content, false); } @@ -497,6 +563,13 @@ public void saveToCleanRequested(HoodieInstant instant, Option content) createFileInMetaPath(instant.getFileName(), content, false); } + public void saveToRollbackRequested(HoodieInstant instant, Option content) { + ValidationUtils.checkArgument(instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)); + ValidationUtils.checkArgument(instant.getState().equals(State.REQUESTED)); + // Plan is stored in meta path + createFileInMetaPath(instant.getFileName(), content, false); + } + private void createFileInMetaPath(String filename, Option content, boolean allowOverwrite) { Path fullPath = new Path(metaClient.getMetaPath(), filename); if (allowOverwrite || metaClient.getTimelineLayoutVersion().isNullVersion()) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java index 6b05eddcc551b..4926b2a55ffa6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java @@ -41,6 +41,7 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; @@ -208,9 +209,8 @@ private List loadInstants(TimeRangeFilter filter, boolean loadIns List instantsInRange = new ArrayList<>(); for (FileStatus fs : fsStatuses) { //read the archived file - HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(), - new HoodieLogFile(fs.getPath()), 
HoodieArchivedMetaEntry.getClassSchema()); - try { + try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(), + new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema())) { int instantsInPreviousFile = instantsInRange.size(); //read the avro blocks while (reader.hasNext()) { @@ -220,8 +220,8 @@ private List loadInstants(TimeRangeFilter filter, boolean loadIns List records = blk.getRecords(); // filter blocks in desired time window Stream instantsInBlkStream = records.stream() - .filter(r -> commitsFilter.apply((GenericRecord) r)) - .map(r -> readCommit((GenericRecord) r, loadInstantDetails)); + .filter(r -> commitsFilter.apply((GenericRecord) r)) + .map(r -> readCommit((GenericRecord) r, loadInstantDetails)); if (filter != null) { instantsInBlkStream = instantsInBlkStream.filter(filter::isInRange); @@ -238,11 +238,10 @@ private List loadInstants(TimeRangeFilter filter, boolean loadIns break; } } - } finally { - reader.close(); } } + Collections.sort(instantsInRange); return instantsInRange; } catch (IOException e) { throw new HoodieIOException( diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java index d4bf2dd209f5d..552adfa8f3490 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java @@ -229,7 +229,14 @@ public HoodieTimeline getCleanerTimeline() { */ public HoodieTimeline getRollbackTimeline() { return new HoodieDefaultTimeline(filterInstantsByAction(ROLLBACK_ACTION), - (Function> & Serializable) this::getInstantDetails); + (Function> & Serializable) this::getInstantDetails); + } + + /** + * Get only the rollback and restore action (inflight and completed) in the active timeline. + */ + public HoodieTimeline getRollbackAndRestoreTimeline() { + return getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION, RESTORE_ACTION)); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java index 65376b48e07c5..a8df62c6496ae 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java @@ -147,7 +147,8 @@ public String getFileName() { : HoodieTimeline.makeCleanerFileName(timestamp); } else if (HoodieTimeline.ROLLBACK_ACTION.equals(action)) { return isInflight() ? HoodieTimeline.makeInflightRollbackFileName(timestamp) - : HoodieTimeline.makeRollbackFileName(timestamp); + : isRequested() ? HoodieTimeline.makeRequestedRollbackFileName(timestamp) + : HoodieTimeline.makeRollbackFileName(timestamp); } else if (HoodieTimeline.SAVEPOINT_ACTION.equals(action)) { return isInflight() ? 
HoodieTimeline.makeInflightSavePointFileName(timestamp) : HoodieTimeline.makeSavePointFileName(timestamp); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java new file mode 100644 index 0000000000000..817b39254ef05 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table.timeline; + +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.time.temporal.TemporalAccessor; +import java.util.Date; +import java.util.concurrent.atomic.AtomicReference; + +/** + * Utility class to generate and parse timestamps used in Instants. + */ +public class HoodieInstantTimeGenerator { + // Format of the timestamp used for an Instant + private static final String INSTANT_TIMESTAMP_FORMAT = "yyyyMMddHHmmss"; + // Formatter to generate Instant timestamps + private static DateTimeFormatter INSTANT_TIME_FORMATTER = DateTimeFormatter.ofPattern(INSTANT_TIMESTAMP_FORMAT); + // The last Instant timestamp generated + private static AtomicReference lastInstantTime = new AtomicReference<>(String.valueOf(Integer.MIN_VALUE)); + private static final String ALL_ZERO_TIMESTAMP = "00000000000000"; + + /** + * Returns next instant time that adds N milliseconds to the current time. 
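
Because the generator implemented in this new class loops until the freshly formatted time is strictly greater than the last value held in the AtomicReference, callers observe monotonically increasing instant times even when invoked within the same wall-clock second. A tiny illustrative check; GREATER_THAN is assumed from the existing HoodieTimeline comparators:

    String first = HoodieInstantTimeGenerator.createNewInstantTime(0);
    String second = HoodieInstantTimeGenerator.createNewInstantTime(0);
    // second is strictly greater than first, even if both calls land in the same second
    ValidationUtils.checkArgument(HoodieTimeline.compareTimestamps(second, HoodieTimeline.GREATER_THAN, first));
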
+ * Ensures each instant time is atleast 1 second apart since we create instant times at second granularity + * + * @param milliseconds Milliseconds to add to current time while generating the new instant time + */ + public static String createNewInstantTime(long milliseconds) { + return lastInstantTime.updateAndGet((oldVal) -> { + String newCommitTime; + do { + Date d = new Date(System.currentTimeMillis() + milliseconds); + newCommitTime = INSTANT_TIME_FORMATTER.format(convertDateToTemporalAccessor(d)); + } while (HoodieTimeline.compareTimestamps(newCommitTime, HoodieActiveTimeline.LESSER_THAN_OR_EQUALS, oldVal)); + return newCommitTime; + }); + } + + public static Date parseInstantTime(String timestamp) { + try { + LocalDateTime dt = LocalDateTime.parse(timestamp, INSTANT_TIME_FORMATTER); + return Date.from(dt.atZone(ZoneId.systemDefault()).toInstant()); + } catch (DateTimeParseException e) { + // Special handling for all zero timestamp which is not parsable by DateTimeFormatter + if (timestamp.equals(ALL_ZERO_TIMESTAMP)) { + return new Date(0); + } + + throw e; + } + } + + public static String formatInstantTime(Instant timestamp) { + return INSTANT_TIME_FORMATTER.format(timestamp); + } + + public static String formatInstantTime(Date timestamp) { + return INSTANT_TIME_FORMATTER.format(convertDateToTemporalAccessor(timestamp)); + } + + private static TemporalAccessor convertDateToTemporalAccessor(Date d) { + return d.toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java index 1e366147a6047..b473c7b1fb4d1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java @@ -73,6 +73,7 @@ public interface HoodieTimeline extends Serializable { String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION; String REQUESTED_CLEAN_EXTENSION = "." + CLEAN_ACTION + REQUESTED_EXTENSION; String INFLIGHT_ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION + INFLIGHT_EXTENSION; + String REQUESTED_ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION + REQUESTED_EXTENSION; String INFLIGHT_SAVEPOINT_EXTENSION = "." 
+ SAVEPOINT_ACTION + INFLIGHT_EXTENSION; String REQUESTED_COMPACTION_SUFFIX = StringUtils.join(COMPACTION_ACTION, REQUESTED_EXTENSION); String REQUESTED_COMPACTION_EXTENSION = StringUtils.join(".", REQUESTED_COMPACTION_SUFFIX); @@ -363,6 +364,10 @@ static String makeRollbackFileName(String instant) { return StringUtils.join(instant, HoodieTimeline.ROLLBACK_EXTENSION); } + static String makeRequestedRollbackFileName(String instant) { + return StringUtils.join(instant, HoodieTimeline.REQUESTED_ROLLBACK_EXTENSION); + } + static String makeInflightRollbackFileName(String instant) { return StringUtils.join(instant, HoodieTimeline.INFLIGHT_ROLLBACK_EXTENSION); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java index a50c2998a19e1..32e42ee58ac27 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java @@ -28,6 +28,7 @@ import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.avro.model.HoodieSavepointPartitionMetadata; import org.apache.hudi.common.HoodieRollbackStat; @@ -109,6 +110,10 @@ public static Option serializeCleanerPlan(HoodieCleanerPlan cleanPlan) t return serializeAvroMetadata(cleanPlan, HoodieCleanerPlan.class); } + public static Option serializeRollbackPlan(HoodieRollbackPlan rollbackPlan) throws IOException { + return serializeAvroMetadata(rollbackPlan, HoodieRollbackPlan.class); + } + public static Option serializeCleanMetadata(HoodieCleanMetadata metadata) throws IOException { return serializeAvroMetadata(metadata, HoodieCleanMetadata.class); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java index 01122fdc5501c..eca3718f18a03 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java @@ -285,9 +285,7 @@ private void ensurePartitionLoadedCorrectly(String partition) { try { LOG.info("Building file system view for partition (" + partitionPathStr + ")"); - // Create the path if it does not exist already Path partitionPath = FSUtils.getPartitionPath(metaClient.getBasePath(), partitionPathStr); - FSUtils.createPathIfNotExists(metaClient.getFs(), partitionPath); long beginLsTs = System.currentTimeMillis(); FileStatus[] statuses = listPartition(partitionPath); long endLsTs = System.currentTimeMillis(); @@ -317,7 +315,13 @@ private void ensurePartitionLoadedCorrectly(String partition) { * @throws IOException */ protected FileStatus[] listPartition(Path partitionPath) throws IOException { - return metaClient.getFs().listStatus(partitionPath); + // Create the path if it does not exist already + if (!metaClient.getFs().exists(partitionPath)) { + metaClient.getFs().mkdirs(partitionPath); + return new FileStatus[0]; + } else { + return metaClient.getFs().listStatus(partitionPath); + } } /** diff --git 
a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java index 23b0536c240dc..4c21530102072 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java @@ -479,7 +479,7 @@ public HoodieTimeline getTimeline() { @Override public void sync() { - // noop + refresh(); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java index 0f1f49fedc860..de2e345a86989 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java @@ -52,6 +52,7 @@ import org.apache.orc.storage.serde2.io.DateWritable; import org.apache.hudi.exception.HoodieIOException; import org.apache.orc.TypeDescription; +import static org.apache.avro.JsonProperties.NULL_VALUE; /** * Methods including addToVector, addUnionValue, createOrcSchema are originally from @@ -796,4 +797,78 @@ private static Schema getActualSchemaType(Schema unionSchema) { return Schema.createUnion(nonNullMembers); } } + + public static Schema createAvroSchemaWithDefaultValue(TypeDescription orcSchema, String recordName, String namespace, boolean nullable) { + Schema avroSchema = createAvroSchemaWithNamespace(orcSchema,recordName,namespace); + List fields = new ArrayList(); + List fieldList = avroSchema.getFields(); + for (Field field : fieldList) { + Schema fieldSchema = field.schema(); + Schema nullableSchema = Schema.createUnion(Schema.create(Schema.Type.NULL),fieldSchema); + if (nullable) { + fields.add(new Schema.Field(field.name(), nullableSchema, null, NULL_VALUE)); + } else { + fields.add(new Schema.Field(field.name(), fieldSchema, null, (Object) null)); + } + } + Schema schema = Schema.createRecord(recordName, null, null, false); + schema.setFields(fields); + return schema; + } + + private static Schema createAvroSchemaWithNamespace(TypeDescription orcSchema, String recordName, String namespace) { + switch (orcSchema.getCategory()) { + case BOOLEAN: + return Schema.create(Schema.Type.BOOLEAN); + case BYTE: + // tinyint (8 bit), use int to hold it + return Schema.create(Schema.Type.INT); + case SHORT: + // smallint (16 bit), use int to hold it + return Schema.create(Schema.Type.INT); + case INT: + // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS, but there is no way to distinguish + return Schema.create(Schema.Type.INT); + case LONG: + // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS, but there is no way to distinguish + return Schema.create(Schema.Type.LONG); + case FLOAT: + return Schema.create(Schema.Type.FLOAT); + case DOUBLE: + return Schema.create(Schema.Type.DOUBLE); + case VARCHAR: + case CHAR: + case STRING: + return Schema.create(Schema.Type.STRING); + case DATE: + Schema date = Schema.create(Schema.Type.INT); + LogicalTypes.date().addToSchema(date); + return date; + case TIMESTAMP: + Schema timestamp = Schema.create(Schema.Type.LONG); + LogicalTypes.timestampMillis().addToSchema(timestamp); + return timestamp; + case BINARY: + return Schema.create(Schema.Type.BYTES); + case DECIMAL: + Schema decimal = Schema.create(Schema.Type.BYTES); + LogicalTypes.decimal(orcSchema.getPrecision(), 
orcSchema.getScale()).addToSchema(decimal); + return decimal; + case LIST: + return Schema.createArray(createAvroSchemaWithNamespace(orcSchema.getChildren().get(0), recordName, "")); + case MAP: + return Schema.createMap(createAvroSchemaWithNamespace(orcSchema.getChildren().get(1), recordName, "")); + case STRUCT: + List childFields = new ArrayList<>(); + for (int i = 0; i < orcSchema.getChildren().size(); i++) { + TypeDescription childType = orcSchema.getChildren().get(i); + String childName = orcSchema.getFieldNames().get(i); + childFields.add(new Field(childName, createAvroSchemaWithNamespace(childType, childName, ""), null, null)); + } + return Schema.createRecord(recordName, null, namespace, false, childFields); + default: + throw new IllegalStateException(String.format("Unrecognized ORC type: %s", orcSchema.getCategory().getName())); + + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java index a86879ad6305e..97e9133cfa51c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java @@ -26,11 +26,11 @@ public final class Base64CodecUtil { /** * Decodes data from the input string into using the encoding scheme. * - * @param serString + * @param encodedString - Base64 encoded string to decode * @return A newly-allocated byte array containing the decoded bytes. */ - public static byte[] decode(String serString) { - return Base64.getDecoder().decode(serString.getBytes(StandardCharsets.UTF_8)); + public static byte[] decode(String encodedString) { + return Base64.getDecoder().decode(encodedString.getBytes(StandardCharsets.UTF_8)); } /** diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ClosableIterator.java similarity index 68% rename from hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java rename to hudi-common/src/main/java/org/apache/hudi/common/util/ClosableIterator.java index b84ce6dd552f6..9e1d0c2b2b954 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ClosableIterator.java @@ -16,15 +16,16 @@ * limitations under the License. */ -package org.apache.hudi.table.upgrade; +package org.apache.hudi.common.util; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.keygen.constant.KeyGeneratorOptions; - -public class OneToTwoUpgradeHandler extends BaseOneToTwoUpgradeHandler { +import java.util.Iterator; +/** + * An iterator that give a chance to release resources. 
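
To illustrate what createAvroSchemaWithDefaultValue produces: with nullable = true, every top-level field of the ORC struct is wrapped in a ["null", type] union whose default is null. A hedged sketch; the struct literal, record name and namespace are made up for the example:

    TypeDescription orcSchema = TypeDescription.fromString("struct<id:int,name:string>");
    Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "ExampleRecord", "org.example", true);
    // avroSchema.getField("id").schema() is now UNION[NULL, INT] with a null default,
    // so records written without the field still deserialize cleanly.
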
+ * + * @param The return type + */ +public interface ClosableIterator extends Iterator, AutoCloseable { @Override - String getPartitionColumns(HoodieWriteConfig config) { - return config.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()); - } + void close(); // override to not throw exception } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java index bf2473913585d..e418043fe0ecd 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java @@ -19,8 +19,6 @@ package org.apache.hudi.common.util; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -50,8 +48,6 @@ import org.apache.orc.RecordReader; import org.apache.orc.TypeDescription; -import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_SCHEMA_METADATA_KEY; - /** * Utility functions for ORC files. */ @@ -226,9 +222,8 @@ public Map readFooter(Configuration conf, boolean required, public Schema readAvroSchema(Configuration conf, Path orcFilePath) { try { Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf)); - ByteBuffer schemaBuffer = reader.getMetadataValue(HOODIE_AVRO_SCHEMA_METADATA_KEY); - String schemaText = StandardCharsets.UTF_8.decode(schemaBuffer).toString(); - return new Schema.Parser().parse(schemaText); + TypeDescription orcSchema = reader.getSchema(); + return AvroOrcUtils.createAvroSchema(orcSchema); } catch (IOException io) { throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java index 20c79dd78e130..5970e02d6799a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java @@ -19,7 +19,7 @@ package org.apache.hudi.common.util; import org.apache.hudi.common.util.queue.BoundedInMemoryQueue; -import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieException; import org.apache.parquet.hadoop.ParquetReader; @@ -49,8 +49,9 @@ public boolean hasNext() { this.next = parquetReader.read(); } return this.next != null; - } catch (IOException io) { - throw new HoodieIOException("unable to read next record from parquet file ", io); + } catch (Exception e) { + FileIOUtils.closeQuietly(parquetReader); + throw new HoodieException("unable to read next record from parquet file ", e); } } @@ -60,14 +61,15 @@ public T next() { // To handle case when next() is called before hasNext() if (this.next == null) { if (!hasNext()) { - throw new HoodieIOException("No more records left to read from parquet file"); + throw new HoodieException("No more records left to read from parquet file"); } } T retVal = this.next; this.next = parquetReader.read(); return retVal; - } catch (IOException io) { - throw new HoodieIOException("unable to read next record from parquet file ", io); + } catch (Exception e) { + FileIOUtils.closeQuietly(parquetReader); + throw new HoodieException("unable to read next record from parquet file ", e); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java 
b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index ebe361025991c..c142e8a9608be 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.exception.HoodieIOException; @@ -41,12 +42,14 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.function.Function; +import java.util.stream.Collectors; /** * Utility functions involving with parquet. @@ -277,4 +280,59 @@ public Boolean apply(String recordKey) { return candidateKeys.contains(recordKey); } } + + /** + * Parse min/max statistics stored in parquet footers for all columns. + */ + public Collection> readRangeFromParquetMetadata(Configuration conf, Path parquetFilePath, List cols) { + ParquetMetadata metadata = readMetadata(conf, parquetFilePath); + // collect stats from all parquet blocks + Map>> columnToStatsListMap = metadata.getBlocks().stream().flatMap(blockMetaData -> { + return blockMetaData.getColumns().stream().filter(f -> cols.contains(f.getPath().toDotString())).map(columnChunkMetaData -> + new HoodieColumnRangeMetadata<>(parquetFilePath.getName(), columnChunkMetaData.getPath().toDotString(), + columnChunkMetaData.getStatistics().genericGetMin(), + columnChunkMetaData.getStatistics().genericGetMax(), + columnChunkMetaData.getStatistics().getNumNulls(), + columnChunkMetaData.getPrimitiveType().stringifier())); + }).collect(Collectors.groupingBy(e -> e.getColumnName())); + + // we only intend to keep file level statistics. + return new ArrayList<>(columnToStatsListMap.values().stream() + .map(blocks -> getColumnRangeInFile(blocks)) + .collect(Collectors.toList())); + } + + private HoodieColumnRangeMetadata getColumnRangeInFile(final List> blockRanges) { + if (blockRanges.size() == 1) { + // only one block in parquet file. we can just return that range. + return blockRanges.get(0); + } else { + // there are multiple blocks. Compute min(block_mins) and max(block_maxs) + return blockRanges.stream().reduce((b1, b2) -> combineRanges(b1, b2)).get(); + } + } + + private HoodieColumnRangeMetadata combineRanges(HoodieColumnRangeMetadata range1, + HoodieColumnRangeMetadata range2) { + final Comparable minValue; + final Comparable maxValue; + if (range1.getMinValue() != null && range2.getMinValue() != null) { + minValue = range1.getMinValue().compareTo(range2.getMinValue()) < 0 ? range1.getMinValue() : range2.getMinValue(); + } else if (range1.getMinValue() == null) { + minValue = range2.getMinValue(); + } else { + minValue = range1.getMinValue(); + } + + if (range1.getMaxValue() != null && range2.getMaxValue() != null) { + maxValue = range1.getMaxValue().compareTo(range2.getMaxValue()) < 0 ? 
range2.getMaxValue() : range1.getMaxValue(); + } else if (range1.getMaxValue() == null) { + maxValue = range2.getMaxValue(); + } else { + maxValue = range1.getMaxValue(); + } + + return new HoodieColumnRangeMetadata<>(range1.getFilePath(), + range1.getColumnName(), minValue, maxValue, range1.getNumNulls() + range2.getNumNulls(), range1.getStringifier()); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/PartitionPathEncodeUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/PartitionPathEncodeUtils.java index a63a529408c1f..e489143746953 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/PartitionPathEncodeUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/PartitionPathEncodeUtils.java @@ -25,6 +25,8 @@ */ public class PartitionPathEncodeUtils { + public static final String DEFAULT_PARTITION_PATH = "default"; + static BitSet charToEscape = new BitSet(128); static { for (char c = 0; c < ' '; c++) { @@ -64,14 +66,11 @@ public static String escapePathName(String path) { * @return An escaped path name. */ public static String escapePathName(String path, String defaultPath) { - - // __HIVE_DEFAULT_NULL__ is the system default value for null and empty string. - // TODO: we should allow user to specify default partition or HDFS file location. if (path == null || path.length() == 0) { if (defaultPath == null) { - //previously, when path is empty or null and no default path is specified, - // __HIVE_DEFAULT_PARTITION__ was the return value for escapePathName - return "__HIVE_DEFAULT_PARTITION__"; + // previously, when path is empty or null and no default path is specified, + // "default" was the return value for escapePathName + return DEFAULT_PARTITION_PATH; } else { return defaultPath; } @@ -111,4 +110,12 @@ public static String unescapePathName(String path) { } return sb.toString(); } + + public static String escapePartitionValue(String value) { + if (value == null || value.isEmpty()) { + return DEFAULT_PARTITION_PATH; + } else { + return escapePathName(value); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java index 7590e9aced1dd..289901df81861 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.fs.SizeAwareDataOutputStream; import org.apache.hudi.common.util.BufferedRandomAccessFile; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.SerializationUtils; import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.exception.HoodieException; @@ -37,11 +38,12 @@ import java.io.InputStream; import java.io.RandomAccessFile; import java.io.Serializable; -import java.net.InetAddress; import java.util.AbstractMap; +import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.Queue; import java.util.Set; @@ -88,6 +90,8 @@ public final class BitCaskDiskMap randomAccessFile = new ThreadLocal<>(); private final Queue openedAccessFiles = new ConcurrentLinkedQueue<>(); + private final List> iterators = new ArrayList<>(); + public BitCaskDiskMap(String baseFilePath, boolean isCompressionEnabled) throws IOException { super(baseFilePath, 
ExternalSpillableMap.DiskMapType.BITCASK.name()); this.valueMetadataMap = new ConcurrentHashMap<>(); @@ -133,9 +137,7 @@ private void initFile(File writeOnlyFile) throws IOException { writeOnlyFile.getParentFile().mkdir(); } writeOnlyFile.createNewFile(); - LOG.info("Spilling to file location " + writeOnlyFile.getAbsolutePath() + " in host (" - + InetAddress.getLocalHost().getHostAddress() + ") with hostname (" + InetAddress.getLocalHost().getHostName() - + ")"); + LOG.debug("Spilling to file location " + writeOnlyFile.getAbsolutePath()); // Make sure file is deleted when JVM exits writeOnlyFile.deleteOnExit(); } @@ -153,7 +155,9 @@ private void flushToDisk() { */ @Override public Iterator iterator() { - return new LazyFileIterable(filePath, valueMetadataMap, isCompressionEnabled).iterator(); + ClosableIterator iterator = new LazyFileIterable(filePath, valueMetadataMap, isCompressionEnabled).iterator(); + this.iterators.add(iterator); + return iterator; } /** @@ -278,6 +282,7 @@ public void close() { } } writeOnlyFile.delete(); + this.iterators.forEach(ClosableIterator::close); } catch (Exception e) { // delete the file for any sort of exception writeOnlyFile.delete(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java index 33d07d5bca0cc..49d81443151a3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.util.collection; import org.apache.hudi.common.util.BufferedRandomAccessFile; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.exception.HoodieException; import java.io.IOException; @@ -53,7 +54,7 @@ public LazyFileIterable(String filePath, Map ma } @Override - public Iterator iterator() { + public ClosableIterator iterator() { try { return new LazyFileIterator<>(filePath, inMemoryMetadataOfSpilledData); } catch (IOException io) { @@ -64,7 +65,7 @@ public Iterator iterator() { /** * Iterator implementation for the iterable defined above. */ - public class LazyFileIterator implements Iterator { + public class LazyFileIterator implements ClosableIterator { private final String filePath; private BufferedRandomAccessFile readOnlyFileHandle; @@ -111,7 +112,7 @@ public void forEachRemaining(Consumer action) { action.accept(next()); } - private void close() { + public void close() { closeHandle(); Runtime.getRuntime().removeShutdownHook(shutdownThread); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java new file mode 100644 index 0000000000000..be4db44ecd961 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
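The BitCaskDiskMap and LazyFileIterable changes above make every spilled-file iterator closable and let the map remember the iterators it hands out, so BitCaskDiskMap#close() can release outstanding file handles even when a caller abandons an iterator halfway through. A minimal sketch of that ownership pattern under the same ClosableIterator contract (the store class and its names are illustrative, not Hudi code):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

interface ClosableIterator<R> extends Iterator<R>, AutoCloseable {
  @Override
  void close(); // narrowed from AutoCloseable: no checked exception for callers to handle
}

class SpillingStore implements AutoCloseable {
  private final List<ClosableIterator<String>> iterators = new ArrayList<>();

  ClosableIterator<String> iterator(Iterator<String> backing) {
    ClosableIterator<String> it = new ClosableIterator<String>() {
      public boolean hasNext() { return backing.hasNext(); }
      public String next() { return backing.next(); }
      public void close() { /* release the underlying spill-file handle here */ }
    };
    iterators.add(it); // remember it, as BitCaskDiskMap#iterator() now does
    return it;
  }

  @Override
  public void close() {
    iterators.forEach(ClosableIterator::close); // mirrors BitCaskDiskMap#close() above
  }
}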
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * A stateful Hoodie object ID representing any table column. + */ +public class ColumnID extends HoodieID { + + private static final Type TYPE = Type.COLUMN; + private static final HashID.Size ID_COLUMN_HASH_SIZE = HashID.Size.BITS_64; + private final byte[] hash; + + public ColumnID(final String message) { + this.hash = HashID.hash(message, ID_COLUMN_HASH_SIZE); + } + + @Override + public int bits() { + return ID_COLUMN_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java new file mode 100644 index 0000000000000..0cb73c5abf9a8 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * Hoodie object ID representing any file. 
+ */ +public class FileID extends HoodieID { + + private static final Type TYPE = Type.FILE; + private static final HashID.Size ID_FILE_HASH_SIZE = HashID.Size.BITS_128; + private final byte[] hash; + + public FileID(final String message) { + this.hash = HashID.hash(message, ID_FILE_HASH_SIZE); + } + + @Override + public int bits() { + return ID_FILE_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java new file mode 100644 index 0000000000000..c56d76097866b --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import net.jpountz.xxhash.XXHash32; +import net.jpountz.xxhash.XXHash64; +import net.jpountz.xxhash.XXHashFactory; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hudi.exception.HoodieIOException; + +import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +/** + * A stateless Hash class which generates ID for the desired bit count. + */ +public class HashID implements Serializable { + + private static final String MD5_ALGORITHM_NAME = "MD5"; + private static final int HASH_SEED = 0xdabadaba; + + /** + * Represents HashID size in bits. + */ + public enum Size { + BITS_32(32), + BITS_64(64), + BITS_128(128); + + private final int bits; + + Size(int bitCount) { + this.bits = bitCount; + } + + /** + * Get this Hash size in bytes. + * + * @return Bytes needed to represent this size + */ + public int byteSize() { + return (((this.bits - 1) / Byte.SIZE) + 1); + } + + /** + * Get this Hash size in bits. + * + * @return bits needed to represent the size + */ + public int bits() { + return this.bits; + } + + @Override + public String toString() { + return "HashSize{" + bits + "}"; + } + } + + /** + * Get the hash value for a string message and for the desired @{@link Size}. 
+ * + * @param message - String message to get the hash value for + * @param bits - @{@link Size} of the hash value + * @return Hash value for the message as byte array + */ + public static byte[] hash(final String message, final Size bits) { + return hash(message.getBytes(StandardCharsets.UTF_8), bits); + } + + /** + * Get the hash value for a byte array and for the desired @{@link Size}. + * + * @param messageBytes - Byte array message to get the hash value for + * @param bits - @{@link Size} of the hash value + * @return Hash value for the message as byte array + */ + public static byte[] hash(final byte[] messageBytes, final Size bits) { + switch (bits) { + case BITS_32: + case BITS_64: + return getXXHash(messageBytes, bits); + case BITS_128: + return getMD5Hash(messageBytes); + default: + throw new IllegalArgumentException("Unexpected Hash size bits: " + bits); + } + } + + private static byte[] getXXHash(final byte[] message, final Size bits) { + XXHashFactory factory = XXHashFactory.fastestInstance(); + switch (bits) { + case BITS_32: + XXHash32 hash32 = factory.hash32(); + return Bytes.toBytes(hash32.hash(message, 0, message.length, HASH_SEED)); + case BITS_64: + XXHash64 hash64 = factory.hash64(); + return Bytes.toBytes(hash64.hash(message, 0, message.length, HASH_SEED)); + default: + throw new HoodieIOException("XX" + bits + " hash is unsupported!"); + } + } + + private static byte[] getMD5Hash(final byte[] message) throws HoodieIOException { + try { + MessageDigest messageDigest = MessageDigest.getInstance(MD5_ALGORITHM_NAME); + messageDigest.update(message); + return messageDigest.digest(); + } catch (NoSuchAlgorithmException e) { + throw new HoodieIOException("Failed to create MD5 Hash: " + e); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java new file mode 100644 index 0000000000000..e08e254b0a215 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.exception.HoodieNotSupportedException; + +import java.io.Serializable; + +/** + * A serializable ID that can be used to identify any Hoodie table fields and resources. + */ +public abstract class HoodieID implements Serializable { + + private static final long serialVersionUID = 1L; + + /** + * Supported ID types. 
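HashID above routes 32- and 64-bit requests to XXHash32/XXHash64 and 128-bit requests to MD5, and Size.byteSize() turns the bit width into whole bytes via ((bits - 1) / 8) + 1, so BITS_32, BITS_64 and BITS_128 yield 4, 8 and 16 bytes respectively. A small usage sketch, assuming the class builds as added in this patch (the input strings are only examples):

import org.apache.hudi.common.util.hash.HashID;

public class HashIdDemo {
  public static void main(String[] args) {
    byte[] columnHash = HashID.hash("fare.amount", HashID.Size.BITS_64);                  // XXHash64
    byte[] fileHash = HashID.hash("2021/09/01/part-0001.parquet", HashID.Size.BITS_128);  // MD5

    // byteSize() = ((bits - 1) / Byte.SIZE) + 1
    System.out.println(columnHash.length == HashID.Size.BITS_64.byteSize());   // true: 8 bytes
    System.out.println(fileHash.length == HashID.Size.BITS_128.byteSize());    // true: 16 bytes
  }
}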
+ */ + public enum Type { + COLUMN("HoodieColumnID"), + PARTITION("HoodiePartitionID"), + FILE("HoodieFileID"); + + private final String name; + + Type(final String name) { + this.name = name; + } + + @Override + public String toString() { + return "Type{name='" + name + "'}"; + } + } + + /** + * Get the number of bits representing this ID in memory. + *
+ * Note: Will be in multiples of 8 only. + * + * @return The number of bits in this ID + */ + public abstract int bits(); + + /** + * Get this ID as a byte array. + * + * @return A byte array representing this ID + */ + public abstract byte[] asBytes(); + + /** + * Get the String version of this ID. + * + * @return String version of this ID. + */ + public abstract String toString(); + + /** + * + */ + public String asBase64EncodedString() { + throw new HoodieNotSupportedException("Unsupported hash for " + getType()); + } + + /** + * Get the ID type. + * + * @return This ID type + */ + protected abstract Type getType(); + + /** + * Is this ID a ColumnID type ? + * + * @return True if this ID of ColumnID type + */ + public final boolean isColumnID() { + return (getType() == Type.COLUMN); + } + + /** + * Is this ID a Partition type ? + * + * @return True if this ID of PartitionID type + */ + public final boolean isPartition() { + return (getType() == Type.PARTITION); + } + + /** + * Is this ID a FileID type ? + * + * @return True if this ID of FileID type + */ + public final boolean isFileID() { + return (getType() == Type.FILE); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java new file mode 100644 index 0000000000000..f31159faa2a2f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * Hoodie object ID representing any partition. 
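ColumnID, FileID and the PartitionID defined next follow the same template over HoodieID: hash the name once in the constructor with a fixed HashID.Size (64 bits for columns and partitions, 128 bits for files), return that hash from asBytes(), and Base64-encode it on demand. A rough sketch of how these typed IDs would be used, assuming the classes build as added here (the names passed in are only examples):

import org.apache.hudi.common.util.hash.ColumnID;
import org.apache.hudi.common.util.hash.FileID;

public class HoodieIdDemo {
  public static void main(String[] args) {
    ColumnID columnId = new ColumnID("rider");               // 64-bit XXHash of the column name
    FileID fileId = new FileID("file-0001_0-1-0.parquet");   // 128-bit MD5 of the file name

    System.out.println(columnId.asBase64EncodedString());    // compact Base64 form of the hash
    System.out.println(fileId.asBytes().length);             // 16 bytes for BITS_128
    System.out.println(columnId.isColumnID());               // true: Type.COLUMN
  }
}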
+ */ +public class PartitionID extends HoodieID { + + private static final Type TYPE = Type.PARTITION; + private static final HashID.Size ID_PARTITION_HASH_SIZE = HashID.Size.BITS_64; + private final byte[] hash; + + public PartitionID(final String message) { + this.hash = HashID.hash(message, ID_PARTITION_HASH_SIZE); + } + + @Override + public int bits() { + return ID_PARTITION_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java index 4d55249d1c312..dfe33b49ec0c7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java @@ -172,7 +172,7 @@ private void adjustBufferSizeIfNeeded(final O payload) throws InterruptedExcepti /** * Inserts record into queue after applying transformation. * - * @param t Item to be queueed + * @param t Item to be queued */ public void insertRecord(I t) throws Exception { // If already closed, throw exception @@ -222,7 +222,7 @@ private Option readNextRecord() { throw new HoodieException(e); } } - // Check one more time here as it is possible producer errored out and closed immediately + // Check one more time here as it is possible producer erred out and closed immediately throwExceptionIfFailed(); if (newRecord != null && newRecord.isPresent()) { @@ -244,6 +244,7 @@ public void close() { private void throwExceptionIfFailed() { if (this.hasFailed.get() != null) { + close(); throw new HoodieException("operation has failed", this.hasFailed.get()); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java similarity index 66% rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java rename to hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java index 1f4c2038283bc..579ae21d3ed99 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java @@ -16,18 +16,18 @@ * limitations under the License. */ -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.HoodieSparkUtils; -import org.apache.hudi.config.HoodieWriteConfig; +package org.apache.hudi.exception; /** - * Upgrade handle to assist in upgrading hoodie table from version 1 to 2. + * Exception for incompatible schema. 
*/ -public class OneToTwoUpgradeHandler extends BaseOneToTwoUpgradeHandler { +public class HoodieIncompatibleSchemaException extends Exception { + + public HoodieIncompatibleSchemaException(String msg, Throwable e) { + super(msg, e); + } - @Override - String getPartitionColumns(HoodieWriteConfig config) { - return HoodieSparkUtils.getPartitionColumns(config.getProps()); + public HoodieIncompatibleSchemaException(String msg) { + super(msg); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java index b954e57e77c7e..7b80d1a585974 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java @@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; @@ -33,6 +34,7 @@ import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PositionedReadable; import org.apache.hadoop.fs.Seekable; @@ -55,6 +57,7 @@ public class HoodieHFileReader implements HoodieFileRea private Path path; private Configuration conf; private HFile.Reader reader; + private FSDataInputStream fsDataInputStream; private Schema schema; // Scanner used to read individual keys. This is cached to prevent the overhead of opening the scanner for each // key retrieval. @@ -72,6 +75,13 @@ public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cac this.reader = HFile.createReader(FSUtils.getFs(path.toString(), configuration), path, cacheConfig, conf); } + public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig, FileSystem inlineFs) throws IOException { + this.conf = configuration; + this.path = path; + this.fsDataInputStream = inlineFs.open(path); + this.reader = HFile.createReader(inlineFs, path, cacheConfig, configuration); + } + public HoodieHFileReader(byte[] content) throws IOException { Configuration conf = new Configuration(); Path path = new Path("hoodie"); @@ -164,6 +174,25 @@ public List> readAllRecords() throws IOException { return readAllRecords(schema, schema); } + public List> readRecords(List keys) throws IOException { + reader.loadFileInfo(); + Schema schema = new Schema.Parser().parse(new String(reader.loadFileInfo().get(KEY_SCHEMA.getBytes()))); + return readRecords(keys, schema); + } + + public List> readRecords(List keys, Schema schema) throws IOException { + this.schema = schema; + reader.loadFileInfo(); + List> records = new ArrayList<>(); + for (String key: keys) { + Option value = getRecordByKey(key, schema); + if (value.isPresent()) { + records.add(new Pair(key, value.get())); + } + } + return records; + } + @Override public Iterator getRecordIterator(Schema readerSchema) throws IOException { final HFileScanner scanner = reader.getScanner(false, false); @@ -217,7 +246,7 @@ public Option getRecordByKey(String key, Schema readerSchema) throws IOException synchronized (this) { if (keyScanner == null) { - keyScanner = reader.getScanner(true, true); + keyScanner = reader.getScanner(false, true); } if (keyScanner.seekTo(kv) == 0) { @@ -250,6 +279,9 @@ public synchronized void close() { try { reader.close(); 
reader = null; + if (fsDataInputStream != null) { + fsDataInputStream.close(); + } keyScanner = null; } catch (IOException e) { throw new HoodieIOException("Error closing the hfile reader", e); diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java index 44850b9e5acfa..b560b76941322 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java @@ -26,14 +26,11 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.metrics.Registry; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hadoop.fs.FileStatus; @@ -59,22 +56,21 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata { protected final transient HoodieEngineContext engineContext; protected final SerializableConfiguration hadoopConf; - protected final String datasetBasePath; - protected final HoodieTableMetaClient datasetMetaClient; + protected final String dataBasePath; + protected final HoodieTableMetaClient dataMetaClient; protected final Option metrics; protected final HoodieMetadataConfig metadataConfig; // Directory used for Spillable Map when merging records protected final String spillableMapDirectory; protected boolean enabled; - private TimelineMergedTableMetadata timelineMergedMetadata; protected BaseTableMetadata(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, - String datasetBasePath, String spillableMapDirectory) { + String dataBasePath, String spillableMapDirectory) { this.engineContext = engineContext; this.hadoopConf = new SerializableConfiguration(engineContext.getHadoopConf()); - this.datasetBasePath = datasetBasePath; - this.datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(datasetBasePath).build(); + this.dataBasePath = dataBasePath; + this.dataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(dataBasePath).build(); this.spillableMapDirectory = spillableMapDirectory; this.metadataConfig = metadataConfig; @@ -104,7 +100,7 @@ public List getAllPartitionPaths() throws IOException { throw new HoodieMetadataException("Failed to retrieve list of partition from metadata", e); } } - return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, datasetBasePath, + return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, dataBasePath, metadataConfig.shouldAssumeDatePartitioning()).getAllPartitionPaths(); } @@ -129,28 +125,25 @@ public FileStatus[] getAllFilesInPartition(Path partitionPath) } } - return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, datasetBasePath, metadataConfig.shouldAssumeDatePartitioning()) + return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, dataBasePath, metadataConfig.shouldAssumeDatePartitioning()) 
.getAllFilesInPartition(partitionPath); } @Override - public Map getAllFilesInPartitions(List partitionPaths) + public Map getAllFilesInPartitions(List partitions) throws IOException { if (enabled) { - Map partitionsFilesMap = new HashMap<>(); - try { - for (String partitionPath : partitionPaths) { - partitionsFilesMap.put(partitionPath, fetchAllFilesInPartition(new Path(partitionPath))); - } + List partitionPaths = partitions.stream().map(entry -> new Path(entry)).collect(Collectors.toList()); + Map partitionsFilesMap = fetchAllFilesInPartitionPaths(partitionPaths); return partitionsFilesMap; } catch (Exception e) { throw new HoodieMetadataException("Failed to retrieve files in partition from metadata", e); } } - return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, datasetBasePath, metadataConfig.shouldAssumeDatePartitioning()) - .getAllFilesInPartitions(partitionPaths); + return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, dataBasePath, metadataConfig.shouldAssumeDatePartitioning()) + .getAllFilesInPartitions(partitions); } /** @@ -158,7 +151,7 @@ public Map getAllFilesInPartitions(List partitionP */ protected List fetchAllPartitionPaths() throws IOException { HoodieTimer timer = new HoodieTimer().startTimer(); - Option> hoodieRecord = getMergedRecordByKey(RECORDKEY_PARTITION_LIST); + Option> hoodieRecord = getRecordByKey(RECORDKEY_PARTITION_LIST, MetadataPartitionType.FILES.partitionPath()); metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_PARTITIONS_STR, timer.endTimer())); List partitions = Collections.emptyList(); @@ -176,28 +169,6 @@ protected List fetchAllPartitionPaths() throws IOException { } } - if (metadataConfig.validateFileListingMetadata()) { - // Validate the Metadata Table data by listing the partitions from the file system - timer.startTimer(); - FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(getEngineContext(), - hadoopConf, datasetBasePath, metadataConfig.shouldAssumeDatePartitioning()); - List actualPartitions = fileSystemBackedTableMetadata.getAllPartitionPaths(); - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_PARTITIONS_STR, timer.endTimer())); - - Collections.sort(actualPartitions); - Collections.sort(partitions); - if (!actualPartitions.equals(partitions)) { - LOG.error("Validation of metadata partition list failed. 
Lists do not match."); - LOG.error("Partitions from metadata: " + Arrays.toString(partitions.toArray())); - LOG.error("Partitions from file system: " + Arrays.toString(actualPartitions.toArray())); - - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_ERRORS_STR, 0)); - } - - // Return the direct listing as it should be correct - partitions = actualPartitions; - } - LOG.info("Listed partitions from metadata: #partitions=" + partitions.size()); return partitions; } @@ -208,13 +179,13 @@ protected List fetchAllPartitionPaths() throws IOException { * @param partitionPath The absolute path of the partition */ FileStatus[] fetchAllFilesInPartition(Path partitionPath) throws IOException { - String partitionName = FSUtils.getRelativePartitionPath(new Path(datasetBasePath), partitionPath); + String partitionName = FSUtils.getRelativePartitionPath(new Path(dataBasePath), partitionPath); if (partitionName.isEmpty()) { partitionName = NON_PARTITIONED_NAME; } HoodieTimer timer = new HoodieTimer().startTimer(); - Option> hoodieRecord = getMergedRecordByKey(partitionName); + Option> hoodieRecord = getRecordByKey(partitionName, MetadataPartitionType.FILES.partitionPath()); metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer())); FileStatus[] statuses = {}; @@ -226,101 +197,52 @@ FileStatus[] fetchAllFilesInPartition(Path partitionPath) throws IOException { statuses = hoodieRecord.get().getData().getFileStatuses(hadoopConf.get(), partitionPath); } - if (metadataConfig.validateFileListingMetadata()) { - // Validate the Metadata Table data by listing the partitions from the file system - timer.startTimer(); - - String partitionPathStr = FSUtils.getRelativePartitionPath(new Path(datasetMetaClient.getBasePath()), partitionPath); - String latestDataInstantTime = getLatestDatasetInstantTime(); - HoodieTableFileSystemView dataFsView = new HoodieTableFileSystemView(datasetMetaClient, datasetMetaClient.getActiveTimeline()); - List directStatuses = dataFsView.getAllFileSlices(partitionPathStr).flatMap(slice -> { - List paths = new ArrayList<>(); - slice.getBaseFile().ifPresent(baseFile -> { - if (HoodieTimeline.compareTimestamps(baseFile.getCommitTime(), HoodieTimeline.LESSER_THAN_OR_EQUALS, latestDataInstantTime)) { - paths.add(baseFile.getFileStatus()); - } - }); - //TODO(metadata): this will remain problematic; no way to know the commit time based on log file written - slice.getLogFiles().forEach(logFile -> paths.add(logFile.getFileStatus())); - return paths.stream(); - }).collect(Collectors.toList()); - - List directFilenames = directStatuses.stream() - .map(fileStatus -> fileStatus.getPath().getName()).sorted() - .collect(Collectors.toList()); - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_FILES_STR, timer.endTimer())); - - List metadataFilenames = Arrays.stream(statuses) - .map(s -> s.getPath().getName()).sorted() - .collect(Collectors.toList()); - - if (!metadataFilenames.equals(directFilenames)) { - LOG.error("Validation of metadata file listing for partition " + partitionName + " failed."); - LOG.error("File list from metadata: " + Arrays.toString(metadataFilenames.toArray())); - LOG.error("File list from direct listing: " + Arrays.toString(directFilenames.toArray())); - - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_ERRORS_STR, 0)); - } - - // Return the direct listing as it should be correct - statuses = directStatuses.toArray(new FileStatus[0]); - } - LOG.info("Listed file in partition from 
metadata: partition=" + partitionName + ", #files=" + statuses.length); return statuses; } - /** - * Retrieve the merged {@code HoodieRecord} mapped to the given key. - * - * @param key The key of the record - */ - private Option> getMergedRecordByKey(String key) { - Option> mergedRecord; - Option> metadataHoodieRecord = getRecordByKeyFromMetadata(key); - // Retrieve record from unsynced timeline instants - Option> timelineHoodieRecord = timelineMergedMetadata.getRecordByKey(key); - if (timelineHoodieRecord.isPresent()) { - if (metadataHoodieRecord.isPresent()) { - HoodieRecordPayload mergedPayload = timelineHoodieRecord.get().getData().preCombine(metadataHoodieRecord.get().getData()); - mergedRecord = Option.of(new HoodieRecord(metadataHoodieRecord.get().getKey(), mergedPayload)); + Map fetchAllFilesInPartitionPaths(List partitionPaths) throws IOException { + Map partitionInfo = new HashMap<>(); + boolean foundNonPartitionedPath = false; + for (Path partitionPath: partitionPaths) { + String partitionName = FSUtils.getRelativePartitionPath(new Path(dataBasePath), partitionPath); + if (partitionName.isEmpty()) { + if (partitionInfo.size() > 1) { + throw new HoodieMetadataException("Found mix of partitioned and non partitioned paths while fetching data from metadata table"); + } + partitionInfo.put(NON_PARTITIONED_NAME, partitionPath); + foundNonPartitionedPath = true; } else { - mergedRecord = timelineHoodieRecord; + if (foundNonPartitionedPath) { + throw new HoodieMetadataException("Found mix of partitioned and non partitioned paths while fetching data from metadata table"); + } + partitionInfo.put(partitionName, partitionPath); } - } else { - mergedRecord = metadataHoodieRecord; } - return mergedRecord; - } - protected abstract Option> getRecordByKeyFromMetadata(String key); + HoodieTimer timer = new HoodieTimer().startTimer(); + List>>> partitionsFileStatus = + getRecordsByKeys(new ArrayList<>(partitionInfo.keySet()), MetadataPartitionType.FILES.partitionPath()); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer())); + Map result = new HashMap<>(); - protected void openTimelineScanner(HoodieActiveTimeline metadataTableTimeline) { - if (timelineMergedMetadata == null) { - List unSyncedInstants = findInstantsToSyncForReader(); - timelineMergedMetadata = - new TimelineMergedTableMetadata(datasetMetaClient, metadataTableTimeline, unSyncedInstants, getUpdateTime(), null); + for (Pair>> entry: partitionsFileStatus) { + if (entry.getValue().isPresent()) { + if (!entry.getValue().get().getData().getDeletions().isEmpty()) { + throw new HoodieMetadataException("Metadata record for partition " + entry.getKey() + " is inconsistent: " + + entry.getValue().get().getData()); + } + result.put(partitionInfo.get(entry.getKey()).toString(), entry.getValue().get().getData().getFileStatuses(hadoopConf.get(), partitionInfo.get(entry.getKey()))); + } } + + LOG.info("Listed files in partitions from metadata: partition list =" + Arrays.toString(partitionPaths.toArray())); + return result; } - /** - * Return the instants which are not-synced to the {@code HoodieTableMetadata}. - * - * This is the list of all completed but un-synched instants. - */ - protected abstract List findInstantsToSyncForReader(); + protected abstract Option> getRecordByKey(String key, String partitionName); - /** - * Return the instants which are not-synced to the {@code HoodieTableMetadataWriter}. 
- * - * This is the list of all completed but un-synched instants which do not have any incomplete instants in between them. - */ - protected abstract List findInstantsToSyncForWriter(); - - @Override - public boolean isInSync() { - return enabled && findInstantsToSyncForWriter().isEmpty(); - } + protected abstract List>>> getRecordsByKeys(List key, String partitionName); protected HoodieEngineContext getEngineContext() { return engineContext != null ? engineContext : new HoodieLocalEngineContext(hadoopConf.get()); @@ -330,15 +252,8 @@ public HoodieMetadataConfig getMetadataConfig() { return metadataConfig; } - protected String getLatestDatasetInstantTime() { - return datasetMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant() + protected String getLatestDataInstantTime() { + return dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant() .map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); } - - public Option getReaderTime() { - if (timelineMergedMetadata == null) { - return Option.empty(); - } - return timelineMergedMetadata.getSyncedInstantTime(); - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java index bb3115ae30c39..f5e14ba1dd34d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java @@ -126,13 +126,13 @@ public Map getAllFilesInPartitions(List partitionP } @Override - public Option getUpdateTime() { + public Option getSyncedInstantTime() { throw new UnsupportedOperationException(); } @Override - public boolean isInSync() { - return true; + public Option getLatestCompactionTime() { + throw new UnsupportedOperationException(); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java index 554a165623b22..1ee8a78e3e2ca 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java @@ -20,6 +20,8 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; @@ -32,9 +34,9 @@ import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.SpillableMapUtils; @@ -42,6 +44,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; 
+import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; @@ -56,15 +59,15 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; /** * Table metadata provided by an internal DFS backed Hudi metadata table. - * - * If the metadata table does not exist, RPC calls are used to retrieve file listings from the file system. - * No updates are applied to the table and it is not synced. */ public class HoodieBackedTableMetadata extends BaseTableMetadata { @@ -72,16 +75,13 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { private String metadataBasePath; // Metadata table's timeline and metaclient - private HoodieTableMetaClient metaClient; - private HoodieTableConfig tableConfig; - private List latestFileSystemMetadataSlices; + private HoodieTableMetaClient metadataMetaClient; + private HoodieTableConfig metadataTableConfig; // should we reuse the open file handles, across calls private final boolean reuse; - - // Readers for the base and log file which store the metadata - private transient HoodieFileReader baseFileReader; - private transient HoodieMetadataMergedLogRecordScanner logRecordScanner; + // Readers for latest file slice corresponding to file groups in the metadata partition of interest + private Map> partitionReaders = new ConcurrentHashMap<>(); public HoodieBackedTableMetadata(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, String datasetBasePath, String spillableMapDirectory) { @@ -96,244 +96,290 @@ public HoodieBackedTableMetadata(HoodieEngineContext engineContext, HoodieMetada } private void initIfNeeded() { + this.metadataBasePath = HoodieTableMetadata.getMetadataTableBasePath(dataBasePath); if (!enabled) { - LOG.info("Metadata table is disabled for " + datasetBasePath); - } else if (this.metaClient == null) { - this.metadataBasePath = HoodieTableMetadata.getMetadataTableBasePath(datasetBasePath); + if (!HoodieTableMetadata.isMetadataTable(metadataBasePath)) { + LOG.info("Metadata table is disabled."); + } + } else if (this.metadataMetaClient == null) { try { - this.metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(metadataBasePath).build(); - this.tableConfig = metaClient.getTableConfig(); - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline()); - latestFileSystemMetadataSlices = fsView.getLatestFileSlices(MetadataPartitionType.FILES.partitionPath()).collect(Collectors.toList()); + this.metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(metadataBasePath).build(); + this.metadataTableConfig = metadataMetaClient.getTableConfig(); } catch (TableNotFoundException e) { LOG.warn("Metadata table was not found at path " + metadataBasePath); this.enabled = false; - this.metaClient = null; - this.tableConfig = null; + this.metadataMetaClient = null; + this.metadataTableConfig = null; } catch (Exception e) { LOG.error("Failed to initialize metadata table at path " + metadataBasePath, e); this.enabled = false; - this.metaClient = null; - this.tableConfig = null; - } - - if (enabled) { - openTimelineScanner(metaClient.getActiveTimeline()); + 
this.metadataMetaClient = null; + this.metadataTableConfig = null; } } } @Override - protected Option> getRecordByKeyFromMetadata(String key) { + protected Option> getRecordByKey(String key, String partitionName) { + return getRecordsByKeys(Collections.singletonList(key), partitionName).get(0).getValue(); + } - openReadersIfNeededOrThrow(); + protected List>>> getRecordsByKeys(List keys, String partitionName) { + Pair readers = openReadersIfNeeded(keys.get(0), partitionName); try { List timings = new ArrayList<>(); - HoodieTimer timer = new HoodieTimer().startTimer(); + HoodieFileReader baseFileReader = readers.getKey(); + HoodieMetadataMergedLogRecordReader logRecordScanner = readers.getRight(); + + // local map to assist in merging with base file records + Map>> logRecords = readLogRecords(logRecordScanner, keys, timings); + List>>> result = readFromBaseAndMergeWithLogRecords(baseFileReader, + keys, logRecords, timings); + LOG.info(String.format("Metadata read for %s keys took [baseFileRead, logMerge] %s ms", keys.size(), timings)); + return result; + } catch (IOException ioe) { + throw new HoodieIOException("Error merging records from metadata table for " + keys.size() + " key : ", ioe); + } finally { + if (!reuse) { + close(partitionName); + } + } + } - // Retrieve record from base file - HoodieRecord hoodieRecord = null; - if (baseFileReader != null) { - HoodieTimer readTimer = new HoodieTimer().startTimer(); - Option baseRecord = baseFileReader.getRecordByKey(key); - if (baseRecord.isPresent()) { - hoodieRecord = tableConfig.populateMetaFields() - ? SpillableMapUtils.convertToHoodieRecordPayload(baseRecord.get(), tableConfig.getPayloadClass(), tableConfig.getPreCombineField(), false) - : SpillableMapUtils.convertToHoodieRecordPayload(baseRecord.get(), tableConfig.getPayloadClass(), tableConfig.getPreCombineField(), - Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp()), false); - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer())); + private Map>> readLogRecords(HoodieMetadataMergedLogRecordReader logRecordScanner, + List keys, List timings) { + HoodieTimer timer = new HoodieTimer().startTimer(); + Map>> logRecords = new HashMap<>(); + // Retrieve records from log file + timer.startTimer(); + if (logRecordScanner != null) { + if (metadataConfig.enableFullScan()) { + // path which does full scan of log files + for (String key : keys) { + logRecords.put(key, logRecordScanner.getRecordByKey(key).get(0).getValue()); + } + } else { + // this path will do seeks pertaining to the keys passed in + List>>> logRecordsList = logRecordScanner.getRecordsByKeys(keys); + for (Pair>> entry : logRecordsList) { + logRecords.put(entry.getKey(), entry.getValue()); } } - timings.add(timer.endTimer()); + } else { + for (String key : keys) { + logRecords.put(key, Option.empty()); + } + } + timings.add(timer.endTimer()); + return logRecords; + } - // Retrieve record from log file - timer.startTimer(); - if (logRecordScanner != null) { - Option> logHoodieRecord = logRecordScanner.getRecordByKey(key); - if (logHoodieRecord.isPresent()) { - if (hoodieRecord != null) { - // Merge the payloads - HoodieRecordPayload mergedPayload = logHoodieRecord.get().getData().preCombine(hoodieRecord.getData()); - hoodieRecord = new HoodieRecord(hoodieRecord.getKey(), mergedPayload); + private List>>> readFromBaseAndMergeWithLogRecords(HoodieFileReader baseFileReader, + List keys, Map>> logRecords, + List timings) throws IOException { + List>>> 
result = new ArrayList<>(); + // merge with base records + HoodieTimer timer = new HoodieTimer().startTimer(); + timer.startTimer(); + HoodieRecord hoodieRecord = null; + // Retrieve record from base file + if (baseFileReader != null) { + HoodieTimer readTimer = new HoodieTimer().startTimer(); + for (String key : keys) { + Option baseRecord = baseFileReader.getRecordByKey(key); + if (baseRecord.isPresent()) { + hoodieRecord = metadataTableConfig.populateMetaFields() + ? SpillableMapUtils.convertToHoodieRecordPayload(baseRecord.get(), metadataTableConfig.getPayloadClass(), metadataTableConfig.getPreCombineField(), false) + : SpillableMapUtils.convertToHoodieRecordPayload(baseRecord.get(), metadataTableConfig.getPayloadClass(), metadataTableConfig.getPreCombineField(), + Pair.of(metadataTableConfig.getRecordKeyFieldProp(), metadataTableConfig.getPartitionFieldProp()), false); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer())); + // merge base file record w/ log record if present + if (logRecords.containsKey(key) && logRecords.get(key).isPresent()) { + HoodieRecordPayload mergedPayload = logRecords.get(key).get().getData().preCombine(hoodieRecord.getData()); + result.add(Pair.of(key, Option.of(new HoodieRecord(hoodieRecord.getKey(), mergedPayload)))); } else { - hoodieRecord = logHoodieRecord.get(); + // only base record + result.add(Pair.of(key, Option.of(hoodieRecord))); } + } else { + // only log record + result.add(Pair.of(key, logRecords.get(key))); } } timings.add(timer.endTimer()); - LOG.info(String.format("Metadata read for key %s took [baseFileRead, logMerge] %s ms", key, timings)); - return Option.ofNullable(hoodieRecord); - } catch (IOException ioe) { - throw new HoodieIOException("Error merging records from metadata table for key :" + key, ioe); - } finally { - if (!reuse) { - closeOrThrow(); + } else { + // no base file at all + timings.add(timer.endTimer()); + for (Map.Entry>> entry : logRecords.entrySet()) { + result.add(Pair.of(entry.getKey(), entry.getValue())); } } - } - - private void openReadersIfNeededOrThrow() { - try { - openReadersIfNeeded(); - } catch (IOException e) { - throw new HoodieIOException("Error opening readers to the Metadata Table: ", e); - } + return result; } /** * Returns a new pair of readers to the base and log files. */ - private void openReadersIfNeeded() throws IOException { - if (reuse && (baseFileReader != null || logRecordScanner != null)) { - // quickly exit out without synchronizing if reusing and readers are already open - return; - } - - // we always force synchronization, if reuse=false, to handle concurrent close() calls as well. 
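readFromBaseAndMergeWithLogRecords above applies a simple per-key precedence rule: when both the base file and the log files hold a record, the log payload's preCombine is invoked with the base payload and the merged result wins; when only one side has the record, that side is returned unchanged. A compact sketch of that three-way decision, with a hypothetical payload type standing in for HoodieRecordPayload:

import java.util.Optional;

class MergeSketch {
  // Hypothetical payload with the same preCombine contract as HoodieRecordPayload.
  interface Payload {
    Payload preCombine(Payload older);
  }

  static Optional<Payload> merge(Optional<Payload> baseRecord, Optional<Payload> logRecord) {
    if (baseRecord.isPresent() && logRecord.isPresent()) {
      // The log record is the newer delta: it decides how to combine with the base record.
      return Optional.of(logRecord.get().preCombine(baseRecord.get()));
    }
    return baseRecord.isPresent() ? baseRecord : logRecord; // whichever side exists, possibly empty
  }
}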
- synchronized (this) { - if (baseFileReader != null || logRecordScanner != null) { - return; - } - - final long baseFileOpenMs; - final long logScannerOpenMs; - - // Metadata is in sync till the latest completed instant on the dataset - HoodieTimer timer = new HoodieTimer().startTimer(); - String latestInstantTime = getLatestDatasetInstantTime(); - ValidationUtils.checkArgument(latestFileSystemMetadataSlices.size() == 1, "must be at-least one valid metadata file slice"); - - // If the base file is present then create a reader - Option basefile = latestFileSystemMetadataSlices.get(0).getBaseFile(); - if (basefile.isPresent()) { - String basefilePath = basefile.get().getPath(); - baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath)); - baseFileOpenMs = timer.endTimer(); - LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath, - basefile.get().getCommitTime(), baseFileOpenMs)); - } else { - baseFileOpenMs = 0; - timer.endTimer(); + private Pair openReadersIfNeeded(String key, String partitionName) { + return partitionReaders.computeIfAbsent(partitionName, k -> { + try { + final long baseFileOpenMs; + final long logScannerOpenMs; + HoodieFileReader baseFileReader = null; + HoodieMetadataMergedLogRecordReader logRecordScanner = null; + + // Metadata is in sync till the latest completed instant on the dataset + HoodieTimer timer = new HoodieTimer().startTimer(); + List latestFileSlices = HoodieTableMetadataUtil.loadPartitionFileGroupsWithLatestFileSlices(metadataMetaClient, partitionName); + ValidationUtils.checkArgument(latestFileSlices.size() == 1, String.format("Invalid number of file slices: found=%d, required=%d", latestFileSlices.size(), 1)); + final FileSlice slice = latestFileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(key, latestFileSlices.size())); + + // Open base file reader + Pair baseFileReaderOpenTimePair = getBaseFileReader(slice, timer); + baseFileReader = baseFileReaderOpenTimePair.getKey(); + baseFileOpenMs = baseFileReaderOpenTimePair.getValue(); + + // Open the log record scanner using the log files from the latest file slice + Pair logRecordScannerOpenTimePair = getLogRecordScanner(slice); + logRecordScanner = logRecordScannerOpenTimePair.getKey(); + logScannerOpenMs = logRecordScannerOpenTimePair.getValue(); + + metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, baseFileOpenMs + logScannerOpenMs)); + return Pair.of(baseFileReader, logRecordScanner); + } catch (IOException e) { + throw new HoodieIOException("Error opening readers for metadata table partition " + partitionName, e); } - - // Open the log record scanner using the log files from the latest file slice - timer.startTimer(); - List logFilePaths = latestFileSystemMetadataSlices.get(0).getLogFiles() - .sorted(HoodieLogFile.getLogFileComparator()) - .map(o -> o.getPath().toString()) - .collect(Collectors.toList()); - Option lastInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant(); - String latestMetaInstantTimestamp = lastInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); - - // Load the schema - Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema()); - HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build(); - logRecordScanner = HoodieMetadataMergedLogRecordScanner.newBuilder() - .withFileSystem(metaClient.getFs()) - 
.withBasePath(metadataBasePath) - .withLogFilePaths(logFilePaths) - .withReaderSchema(schema) - .withLatestInstantTime(latestMetaInstantTimestamp) - .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES) - .withBufferSize(BUFFER_SIZE) - .withSpillableMapBasePath(spillableMapDirectory) - .withDiskMapType(commonConfig.getSpillableDiskMapType()) - .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled()) - .build(); - - logScannerOpenMs = timer.endTimer(); - LOG.info(String.format("Opened metadata log files from %s at instant (dataset instant=%s, metadata instant=%s) in %d ms", - logFilePaths, latestInstantTime, latestMetaInstantTimestamp, logScannerOpenMs)); - - metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, baseFileOpenMs + logScannerOpenMs)); - } + }); } - private void close(HoodieFileReader localFileReader, HoodieMetadataMergedLogRecordScanner localLogScanner) { - try { - if (localFileReader != null) { - localFileReader.close(); - } - if (localLogScanner != null) { - localLogScanner.close(); - } - } catch (Exception e) { - throw new HoodieException("Error closing resources during metadata table merge", e); + private Pair getBaseFileReader(FileSlice slice, HoodieTimer timer) throws IOException { + HoodieFileReader baseFileReader = null; + Long baseFileOpenMs; + // If the base file is present then create a reader + Option basefile = slice.getBaseFile(); + if (basefile.isPresent()) { + String basefilePath = basefile.get().getPath(); + baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath)); + baseFileOpenMs = timer.endTimer(); + LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath, + basefile.get().getCommitTime(), baseFileOpenMs)); + } else { + baseFileOpenMs = 0L; + timer.endTimer(); } + return Pair.of(baseFileReader, baseFileOpenMs); } - private void closeOrThrow() { - try { - close(); - } catch (Exception e) { - throw new HoodieException("Error closing metadata table readers", e); - } + private Set getValidInstantTimestamps() { + // Only those log files which have a corresponding completed instant on the dataset should be read + // This is because the metadata table is updated before the dataset instants are committed. + HoodieActiveTimeline datasetTimeline = dataMetaClient.getActiveTimeline(); + Set validInstantTimestamps = datasetTimeline.filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toSet()); + + // For any rollbacks and restores, we cannot neglect the instants that they are rolling back. + // The rollback instant should be more recent than the start of the timeline for it to have rolled back any + // instant which we have a log block for. + final String earliestInstantTime = validInstantTimestamps.isEmpty() ? 
SOLO_COMMIT_TIMESTAMP : Collections.min(validInstantTimestamps); + datasetTimeline.getRollbackAndRestoreTimeline().filterCompletedInstants().getInstants() + .filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN, earliestInstantTime)) + .forEach(instant -> { + validInstantTimestamps.addAll(getRollbackedCommits(instant, datasetTimeline)); + }); + + // SOLO_COMMIT_TIMESTAMP is used during bootstrap so it is a valid timestamp + validInstantTimestamps.add(SOLO_COMMIT_TIMESTAMP); + return validInstantTimestamps; } - @Override - public synchronized void close() throws Exception { - close(baseFileReader, logRecordScanner); - baseFileReader = null; - logRecordScanner = null; + private Pair getLogRecordScanner(FileSlice slice) { + HoodieTimer timer = new HoodieTimer().startTimer(); + List logFilePaths = slice.getLogFiles() + .sorted(HoodieLogFile.getLogFileComparator()) + .map(o -> o.getPath().toString()) + .collect(Collectors.toList()); + + // Only those log files which have a corresponding completed instant on the dataset should be read + // This is because the metadata table is updated before the dataset instants are committed. + Set validInstantTimestamps = getValidInstantTimestamps(); + + Option latestMetadataInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant(); + String latestMetadataInstantTime = latestMetadataInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); + + // Load the schema + Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema()); + HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build(); + HoodieMetadataMergedLogRecordReader logRecordScanner = HoodieMetadataMergedLogRecordReader.newBuilder() + .withFileSystem(metadataMetaClient.getFs()) + .withBasePath(metadataBasePath) + .withLogFilePaths(logFilePaths) + .withReaderSchema(schema) + .withLatestInstantTime(latestMetadataInstantTime) + .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES) + .withBufferSize(BUFFER_SIZE) + .withSpillableMapBasePath(spillableMapDirectory) + .withDiskMapType(commonConfig.getSpillableDiskMapType()) + .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled()) + .withLogBlockTimestamps(validInstantTimestamps) + .enableFullScan(metadataConfig.enableFullScan()) + .build(); + + Long logScannerOpenMs = timer.endTimer(); + LOG.info(String.format("Opened %d metadata log files (dataset instant=%s, metadata instant=%s) in %d ms", + logFilePaths.size(), getLatestDataInstantTime(), latestMetadataInstantTime, logScannerOpenMs)); + return Pair.of(logRecordScanner, logScannerOpenMs); } /** - * Return the timestamp of the latest synced instant. + * Returns a list of commits which were rolled back as part of a Rollback or Restore operation. + * + * @param instant The rollback or restore instant to read + * @param timeline The active timeline of the dataset.
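+ * @return The list of commit instant times that were rolled back by the given instant.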
*/ - @Override - public Option getUpdateTime() { - if (!enabled) { - return Option.empty(); - } - - HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline(); - return timeline.getDeltaCommitTimeline().filterCompletedInstants() - .lastInstant().map(HoodieInstant::getTimestamp); - } + private List getRollbackedCommits(HoodieInstant instant, HoodieActiveTimeline timeline) { + try { + if (instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) { + HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeHoodieRollbackMetadata( + timeline.getInstantDetails(instant).get()); + return rollbackMetadata.getCommitsRollback(); + } - /** - * Return an ordered list of instants which have not been synced to the Metadata Table. - */ - @Override - protected List findInstantsToSyncForReader() { - return findInstantsToSync(true); + List rollbackedCommits = new LinkedList<>(); + if (instant.getAction().equals(HoodieTimeline.RESTORE_ACTION)) { + // Restore is made up of several rollbacks + HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.deserializeHoodieRestoreMetadata( + timeline.getInstantDetails(instant).get()); + restoreMetadata.getHoodieRestoreMetadata().values().forEach(rms -> { + rms.forEach(rm -> rollbackedCommits.addAll(rm.getCommitsRollback())); + }); + } + return rollbackedCommits; + } catch (IOException e) { + throw new HoodieMetadataException("Error retrieving rollback commits for instant " + instant, e); + } } - /** - * Return an ordered list of instants which have not been synced to the Metadata Table. - */ @Override - protected List findInstantsToSyncForWriter() { - return findInstantsToSync(false); - } - - /** - * Return an ordered list of instants which have not been synced to the Metadata Table. - */ - private List findInstantsToSync(boolean ignoreIncompleteInstants) { - initIfNeeded(); - - // if there are no instants yet, return empty list, since there is nothing to sync here. - if (!enabled || !metaClient.getActiveTimeline().filterCompletedInstants().lastInstant().isPresent()) { - return Collections.EMPTY_LIST; + public void close() { + for (String partitionName : partitionReaders.keySet()) { + close(partitionName); } + partitionReaders.clear(); + } - // All instants on the data timeline, which are greater than the last deltacommit instant on metadata timeline - // are candidates for sync. We only consider delta-commit instants as each actions on dataset leads to a - // deltacommit on the metadata table. - String latestMetadataInstantTime = metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants() - .lastInstant().get().getTimestamp(); - HoodieDefaultTimeline candidateTimeline = datasetMetaClient.getActiveTimeline().findInstantsAfter(latestMetadataInstantTime, Integer.MAX_VALUE); - Option earliestIncompleteInstant = ignoreIncompleteInstants ? 
Option.empty() - : candidateTimeline.filterInflightsAndRequested().firstInstant(); - - if (earliestIncompleteInstant.isPresent()) { - return candidateTimeline.filterCompletedInstants() - .findInstantsBefore(earliestIncompleteInstant.get().getTimestamp()) - .getInstants().collect(Collectors.toList()); - } else { - return candidateTimeline.filterCompletedInstants() - .getInstants().collect(Collectors.toList()); + private synchronized void close(String partitionName) { + Pair readers = partitionReaders.remove(partitionName); + if (readers != null) { + try { + if (readers.getKey() != null) { + readers.getKey().close(); + } + if (readers.getValue() != null) { + readers.getValue().close(); + } + } catch (Exception e) { + throw new HoodieException("Error closing resources during metadata table merge", e); + } } } @@ -345,11 +391,33 @@ public SerializableConfiguration getHadoopConf() { return hadoopConf; } - public HoodieTableMetaClient getMetaClient() { - return metaClient; + public HoodieTableMetaClient getMetadataMetaClient() { + return metadataMetaClient; } public Map stats() { - return metrics.map(m -> m.getStats(true, metaClient, this)).orElse(new HashMap<>()); + return metrics.map(m -> m.getStats(true, metadataMetaClient, this)).orElse(new HashMap<>()); + } + + @Override + public Option getSyncedInstantTime() { + if (metadataMetaClient != null) { + Option latestInstant = metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant(); + if (latestInstant.isPresent()) { + return Option.of(latestInstant.get().getTimestamp()); + } + } + return Option.empty(); + } + + @Override + public Option getLatestCompactionTime() { + if (metadataMetaClient != null) { + Option latestCompaction = metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant(); + if (latestCompaction.isPresent()) { + return Option.of(latestCompaction.get().getTimestamp()); + } + } + return Option.empty(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java similarity index 61% rename from hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordScanner.java rename to hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java index a3c3e086f24c9..131ca3b91762f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java @@ -19,36 +19,47 @@ package org.apache.hudi.metadata; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.log.InstantRange; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ExternalSpillableMap; +import org.apache.hudi.common.util.collection.Pair; /** * A {@code HoodieMergedLogRecordScanner} implementation which only merged records matching providing keys. 
This is * useful in limiting memory usage when only a small subset of updates records are to be read. */ -public class HoodieMetadataMergedLogRecordScanner extends HoodieMergedLogRecordScanner { +public class HoodieMetadataMergedLogRecordReader extends HoodieMergedLogRecordScanner { + + private static final Logger LOG = LogManager.getLogger(HoodieMetadataMergedLogRecordReader.class); + // Set of all record keys that are to be read in memory private Set mergeKeyFilter; - private HoodieMetadataMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, + private HoodieMetadataMergedLogRecordReader(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, String latestInstantTime, Long maxMemorySizeInBytes, int bufferSize, String spillableMapBasePath, Set mergeKeyFilter, - ExternalSpillableMap.DiskMapType diskMapType, boolean isBitCaskDiskMapCompressionEnabled) { + ExternalSpillableMap.DiskMapType diskMapType, boolean isBitCaskDiskMapCompressionEnabled, + Option instantRange, boolean enableFullScan) { super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, false, false, bufferSize, - spillableMapBasePath, Option.empty(), false, diskMapType, isBitCaskDiskMapCompressionEnabled, false); + spillableMapBasePath, instantRange, false, diskMapType, isBitCaskDiskMapCompressionEnabled, false, enableFullScan); this.mergeKeyFilter = mergeKeyFilter; - - performScan(); + if (enableFullScan) { + performScan(); + } } @Override @@ -68,8 +79,8 @@ protected void processNextDeletedKey(HoodieKey hoodieKey) { /** * Returns the builder for {@code HoodieMetadataMergedLogRecordScanner}. */ - public static HoodieMetadataMergedLogRecordScanner.Builder newBuilder() { - return new HoodieMetadataMergedLogRecordScanner.Builder(); + public static HoodieMetadataMergedLogRecordReader.Builder newBuilder() { + return new HoodieMetadataMergedLogRecordReader.Builder(); } /** @@ -78,8 +89,22 @@ public static HoodieMetadataMergedLogRecordScanner.Builder newBuilder() { * @param key Key of the record to retrieve * @return {@code HoodieRecord} if key was found else {@code Option.empty()} */ - public Option> getRecordByKey(String key) { - return Option.ofNullable((HoodieRecord) records.get(key)); + public List>>> getRecordByKey(String key) { + return Collections.singletonList(Pair.of(key, Option.ofNullable((HoodieRecord) records.get(key)))); + } + + public List>>> getRecordsByKeys(List keys) { + records.clear(); + scan(Option.of(keys)); + List>>> metadataRecords = new ArrayList<>(); + keys.forEach(entry -> { + if (records.containsKey(entry)) { + metadataRecords.add(Pair.of(entry, Option.ofNullable((HoodieRecord) records.get(entry)))); + } else { + metadataRecords.add(Pair.of(entry, Option.empty())); + } + }); + return metadataRecords; } /** @@ -87,60 +112,74 @@ public Option> getRecordByKey(String key) { */ public static class Builder extends HoodieMergedLogRecordScanner.Builder { private Set mergeKeyFilter = Collections.emptySet(); + private boolean enableFullScan; + private boolean enableInlineReading; + @Override public Builder withFileSystem(FileSystem fs) { this.fs = fs; return this; } + @Override public Builder withBasePath(String basePath) { this.basePath = basePath; return this; } + @Override public Builder withLogFilePaths(List logFilePaths) { this.logFilePaths = logFilePaths; return this; } + @Override public Builder withReaderSchema(Schema schema) { this.readerSchema = schema; return this; } + @Override public Builder withLatestInstantTime(String latestInstantTime) 
{ this.latestInstantTime = latestInstantTime; return this; } + @Override public Builder withReadBlocksLazily(boolean readBlocksLazily) { throw new UnsupportedOperationException(); } + @Override public Builder withReverseReader(boolean reverseReader) { throw new UnsupportedOperationException(); } + @Override public Builder withBufferSize(int bufferSize) { this.bufferSize = bufferSize; return this; } + @Override public Builder withMaxMemorySizeInBytes(Long maxMemorySizeInBytes) { this.maxMemorySizeInBytes = maxMemorySizeInBytes; return this; } + @Override public Builder withSpillableMapBasePath(String spillableMapBasePath) { this.spillableMapBasePath = spillableMapBasePath; return this; } + @Override public Builder withDiskMapType(ExternalSpillableMap.DiskMapType diskMapType) { this.diskMapType = diskMapType; return this; } + @Override public Builder withBitCaskDiskMapCompressionEnabled(boolean isBitCaskDiskMapCompressionEnabled) { this.isBitCaskDiskMapCompressionEnabled = isBitCaskDiskMapCompressionEnabled; return this; @@ -151,11 +190,38 @@ public Builder withMergeKeyFilter(Set mergeKeyFilter) { return this; } + public Builder withLogBlockTimestamps(Set validLogBlockTimestamps) { + withInstantRange(Option.of(new ExplicitMatchRange(validLogBlockTimestamps))); + return this; + } + + public Builder enableFullScan(boolean enableFullScan) { + this.enableFullScan = enableFullScan; + return this; + } + @Override - public HoodieMetadataMergedLogRecordScanner build() { - return new HoodieMetadataMergedLogRecordScanner(fs, basePath, logFilePaths, readerSchema, + public HoodieMetadataMergedLogRecordReader build() { + return new HoodieMetadataMergedLogRecordReader(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, bufferSize, spillableMapBasePath, mergeKeyFilter, - diskMapType, isBitCaskDiskMapCompressionEnabled); + diskMapType, isBitCaskDiskMapCompressionEnabled, instantRange, enableFullScan); + } + } + + /** + * Class to assist in checking if an instant is part of a set of instants. 
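+ * Used via withLogBlockTimestamps to restrict the metadata log record reader to log blocks written by completed dataset instants.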
+ */ + private static class ExplicitMatchRange extends InstantRange { + Set instants; + + public ExplicitMatchRange(Set instants) { + super(Collections.min(instants), Collections.max(instants)); + this.instants = instants; + } + + @Override + public boolean isInRange(String instant) { + return this.instants.contains(instant); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java index 5924d8989e6b0..2efc96c6f3dee 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java @@ -41,13 +41,9 @@ public class HoodieMetadataMetrics implements Serializable { // Metric names public static final String LOOKUP_PARTITIONS_STR = "lookup_partitions"; public static final String LOOKUP_FILES_STR = "lookup_files"; - public static final String VALIDATE_PARTITIONS_STR = "validate_partitions"; - public static final String VALIDATE_FILES_STR = "validate_files"; - public static final String VALIDATE_ERRORS_STR = "validate_errors"; public static final String SCAN_STR = "scan"; public static final String BASEFILE_READ_STR = "basefile_read"; public static final String INITIALIZE_STR = "initialize"; - public static final String SYNC_STR = "sync"; public static final String REBOOTSTRAP_STR = "rebootstrap"; public static final String BOOTSTRAP_ERR_STR = "bootstrap_error"; @@ -57,7 +53,6 @@ public class HoodieMetadataMetrics implements Serializable { public static final String STAT_COUNT_BASE_FILES = "baseFileCount"; public static final String STAT_COUNT_LOG_FILES = "logFileCount"; public static final String STAT_COUNT_PARTITION = "partitionCount"; - public static final String STAT_IN_SYNC = "isInSync"; public static final String STAT_LAST_COMPACTION_TIMESTAMP = "lastCompactionTimestamp"; private static final Logger LOG = LogManager.getLogger(HoodieMetadataMetrics.class); @@ -82,32 +77,35 @@ private Map getStats(HoodieTableFileSystemView fsView, boolean d Map stats = new HashMap<>(); // Total size of the metadata and count of base/log files - long totalBaseFileSizeInBytes = 0; - long totalLogFileSizeInBytes = 0; - int baseFileCount = 0; - int logFileCount = 0; - List latestSlices = fsView.getLatestFileSlices(MetadataPartitionType.FILES.partitionPath()).collect(Collectors.toList()); - - for (FileSlice slice : latestSlices) { - if (slice.getBaseFile().isPresent()) { - totalBaseFileSizeInBytes += slice.getBaseFile().get().getFileStatus().getLen(); - ++baseFileCount; + for (String metadataPartition : MetadataPartitionType.all()) { + List latestSlices = fsView.getLatestFileSlices(metadataPartition).collect(Collectors.toList()); + + // Total size of the metadata and count of base/log files + long totalBaseFileSizeInBytes = 0; + long totalLogFileSizeInBytes = 0; + int baseFileCount = 0; + int logFileCount = 0; + + for (FileSlice slice : latestSlices) { + if (slice.getBaseFile().isPresent()) { + totalBaseFileSizeInBytes += slice.getBaseFile().get().getFileStatus().getLen(); + ++baseFileCount; + } + Iterator it = slice.getLogFiles().iterator(); + while (it.hasNext()) { + totalLogFileSizeInBytes += it.next().getFileSize(); + ++logFileCount; + } } - Iterator it = slice.getLogFiles().iterator(); - while (it.hasNext()) { - totalLogFileSizeInBytes += it.next().getFileStatus().getLen(); - ++logFileCount; - } - } - stats.put(HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE, 
String.valueOf(totalBaseFileSizeInBytes)); - stats.put(HoodieMetadataMetrics.STAT_TOTAL_LOG_FILE_SIZE, String.valueOf(totalLogFileSizeInBytes)); - stats.put(HoodieMetadataMetrics.STAT_COUNT_BASE_FILES, String.valueOf(baseFileCount)); - stats.put(HoodieMetadataMetrics.STAT_COUNT_LOG_FILES, String.valueOf(logFileCount)); + stats.put(metadataPartition + "." + STAT_TOTAL_BASE_FILE_SIZE, String.valueOf(totalBaseFileSizeInBytes)); + stats.put(metadataPartition + "." + STAT_TOTAL_LOG_FILE_SIZE, String.valueOf(totalLogFileSizeInBytes)); + stats.put(metadataPartition + "." + STAT_COUNT_BASE_FILES, String.valueOf(baseFileCount)); + stats.put(metadataPartition + "." + STAT_COUNT_LOG_FILES, String.valueOf(logFileCount)); + } if (detailed) { stats.put(HoodieMetadataMetrics.STAT_COUNT_PARTITION, String.valueOf(tableMetadata.getAllPartitionPaths().size())); - stats.put(HoodieMetadataMetrics.STAT_IN_SYNC, String.valueOf(tableMetadata.isInSync())); } return stats; @@ -121,26 +119,20 @@ protected void updateMetrics(String action, long durationInMs) { // Update sum of duration and total for count String countKey = action + ".count"; String durationKey = action + ".totalDuration"; - metricsRegistry.add(countKey, 1); - metricsRegistry.add(durationKey, durationInMs); - - LOG.info(String.format("Updating metadata metrics (%s=%dms, %s=1)", durationKey, durationInMs, countKey)); + incrementMetric(countKey, 1); + incrementMetric(durationKey, durationInMs); } - public void updateMetrics(long totalBaseFileSizeInBytes, long totalLogFileSizeInBytes, int baseFileCount, - int logFileCount) { - if (metricsRegistry == null) { - return; + public void updateSizeMetrics(HoodieTableMetaClient metaClient, HoodieBackedTableMetadata metadata) { + Map stats = getStats(false, metaClient, metadata); + for (Map.Entry e : stats.entrySet()) { + incrementMetric(e.getKey(), Long.parseLong(e.getValue())); } + } - // Set new size and count for metadata table's data files - metricsRegistry.set("basefile.size", totalBaseFileSizeInBytes); - metricsRegistry.set("logfile.size", totalLogFileSizeInBytes); - metricsRegistry.set("basefile.count", baseFileCount); - metricsRegistry.set("logfile.count", logFileCount); - - LOG.info(String.format("Updating metadata size metrics (basefile.size=%d, logfile.size=%d, basefile.count=%d, " - + "logfile.count=%d)", totalBaseFileSizeInBytes, totalLogFileSizeInBytes, baseFileCount, logFileCount)); + protected void incrementMetric(String action, long value) { + LOG.info(String.format("Updating metadata metrics (%s=%d) in %s", action, value, metricsRegistry)); + metricsRegistry.add(action, value); } public Registry registry() { diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java index 3964cd1006b75..f5c1762610390 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java @@ -23,7 +23,6 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; - import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; @@ -44,7 +43,7 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable { * {@link org.apache.hudi.common.table.timeline.HoodieTimeline#INIT_INSTANT_TS}, such that the metadata table * can be prepped even before bootstrap is done. 
*/ - String SOLO_COMMIT_TIMESTAMP = "0000000000000"; + String SOLO_COMMIT_TIMESTAMP = "00000000000000"; // Key for the record which saves list of all partitions String RECORDKEY_PARTITION_LIST = "__all_partitions__"; // The partition name used for non-partitioned tables @@ -105,11 +104,12 @@ static HoodieTableMetadata create(HoodieEngineContext engineContext, HoodieMetad Map getAllFilesInPartitions(List partitionPaths) throws IOException; /** - * Get the instant time at which Metadata Table was last updated. - * - * This is the timestamp of the Instant on the dataset which was last synced to the Metadata Table. + * Get the instant time to which the metadata is synced w.r.t data timeline. */ - Option getUpdateTime(); + Option getSyncedInstantTime(); - boolean isInSync(); + /** + * Returns the timestamp of the latest compaction. + */ + Option getLatestCompactionTime(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 14fe07b32c1bb..b028056bb70d9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -19,28 +19,30 @@ package org.apache.hudi.metadata; import org.apache.hudi.avro.model.HoodieCleanMetadata; -import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; -import org.apache.hudi.common.util.CleanerUtils; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieMetadataException; + +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; @@ -59,56 +61,20 @@ public class HoodieTableMetadataUtil { private static final Logger LOG = LogManager.getLogger(HoodieTableMetadataUtil.class); /** - * Converts a timeline instant to metadata table records. + * Delete the metadata table for the dataset. This will be invoked during upgrade/downgrade operation during which no other + * process should be running. 
* - * @param datasetMetaClient The meta client associated with the timeline instant - * @param metadataTableTimeline Current timeline of the Metadata Table - * @param instant to fetch and convert to metadata table records - * @return a list of metadata table records - * @throws IOException + * @param basePath base path of the dataset + * @param context instance of {@link HoodieEngineContext}. */ - public static Option> convertInstantToMetaRecords(HoodieTableMetaClient datasetMetaClient, - HoodieActiveTimeline metadataTableTimeline, HoodieInstant instant, Option lastSyncTs) throws IOException { - HoodieTimeline timeline = datasetMetaClient.getActiveTimeline(); - Option> records = Option.empty(); - ValidationUtils.checkArgument(instant.isCompleted(), "Only completed instants can be synced."); - - switch (instant.getAction()) { - case HoodieTimeline.CLEAN_ACTION: - HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(datasetMetaClient, instant); - records = Option.of(convertMetadataToRecords(cleanMetadata, instant.getTimestamp())); - break; - case HoodieTimeline.DELTA_COMMIT_ACTION: - case HoodieTimeline.COMMIT_ACTION: - case HoodieTimeline.COMPACTION_ACTION: - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class); - records = Option.of(convertMetadataToRecords(commitMetadata, instant.getTimestamp())); - break; - case HoodieTimeline.ROLLBACK_ACTION: - HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeHoodieRollbackMetadata( - timeline.getInstantDetails(instant).get()); - records = Option.of(convertMetadataToRecords(metadataTableTimeline, rollbackMetadata, instant.getTimestamp(), lastSyncTs)); - break; - case HoodieTimeline.RESTORE_ACTION: - HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.deserializeHoodieRestoreMetadata( - timeline.getInstantDetails(instant).get()); - records = Option.of(convertMetadataToRecords(metadataTableTimeline, restoreMetadata, instant.getTimestamp(), lastSyncTs)); - break; - case HoodieTimeline.SAVEPOINT_ACTION: - // Nothing to be done here - break; - case HoodieTimeline.REPLACE_COMMIT_ACTION: - HoodieReplaceCommitMetadata replaceMetadata = HoodieReplaceCommitMetadata.fromBytes( - timeline.getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class); - // Note: we only add new files created here. Replaced files are removed from metadata later by cleaner. - records = Option.of(convertMetadataToRecords(replaceMetadata, instant.getTimestamp())); - break; - default: - throw new HoodieException("Unknown type of action " + instant.getAction()); + public static void deleteMetadataTable(String basePath, HoodieEngineContext context) { + final String metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + FileSystem fs = FSUtils.getFs(metadataTablePath, context.getHadoopConf().get()); + try { + fs.delete(new Path(metadataTablePath), true); + } catch (Exception e) { + throw new HoodieMetadataException("Failed to remove metadata table from path " + metadataTablePath, e); } - - return records; } /** @@ -134,12 +100,11 @@ public static List convertMetadataToRecords(HoodieCommitMetadata c return; } - int offset = partition.equals(NON_PARTITIONED_NAME) ? 0 : partition.length() + 1; + int offset = partition.equals(NON_PARTITIONED_NAME) ? (pathWithPartition.startsWith("/") ? 
1 : 0) : partition.length() + 1; String filename = pathWithPartition.substring(offset); ValidationUtils.checkState(!newFiles.containsKey(filename), "Duplicate files in HoodieCommitMetadata"); newFiles.put(filename, hoodieWriteStat.getTotalWriteBytes()); }); - // New files added to a partition HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord( partition, Option.of(newFiles), Option.empty()); @@ -155,33 +120,6 @@ public static List convertMetadataToRecords(HoodieCommitMetadata c return records; } - /** - * Finds all files that will be deleted as part of a planned clean and creates metadata table records for them. - * - * @param cleanerPlan from timeline to convert - * @param instantTime - * @return a list of metadata table records - */ - public static List convertMetadataToRecords(HoodieCleanerPlan cleanerPlan, String instantTime) { - List records = new LinkedList<>(); - - int[] fileDeleteCount = {0}; - cleanerPlan.getFilePathsToBeDeletedPerPartition().forEach((partition, deletedPathInfo) -> { - fileDeleteCount[0] += deletedPathInfo.size(); - - // Files deleted from a partition - List deletedFilenames = deletedPathInfo.stream().map(p -> new Path(p.getFilePath()).getName()) - .collect(Collectors.toList()); - HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(partition, Option.empty(), - Option.of(deletedFilenames)); - records.add(record); - }); - - LOG.info("Found at " + instantTime + " from CleanerPlan. #partitions_updated=" + records.size() - + ", #files_deleted=" + fileDeleteCount[0]); - return records; - } - /** * Finds all files that were deleted as part of a clean and creates metadata table records for them. * @@ -192,10 +130,9 @@ public static List convertMetadataToRecords(HoodieCleanerPlan clea public static List convertMetadataToRecords(HoodieCleanMetadata cleanMetadata, String instantTime) { List records = new LinkedList<>(); int[] fileDeleteCount = {0}; - cleanMetadata.getPartitionMetadata().forEach((partition, partitionMetadata) -> { // Files deleted from a partition - List deletedFiles = partitionMetadata.getSuccessDeleteFiles(); + List deletedFiles = partitionMetadata.getDeletePathPatterns(); HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(partition, Option.empty(), Option.of(new ArrayList<>(deletedFiles))); @@ -228,10 +165,17 @@ public static List convertMetadataToRecords(HoodieActiveTimeline m } public static List convertMetadataToRecords(HoodieActiveTimeline metadataTableTimeline, - HoodieRollbackMetadata rollbackMetadata, String instantTime, Option lastSyncTs) { + HoodieRollbackMetadata rollbackMetadata, String instantTime, + Option lastSyncTs, boolean wasSynced) { + Map> partitionToAppendedFiles = new HashMap<>(); Map> partitionToDeletedFiles = new HashMap<>(); processRollbackMetadata(metadataTableTimeline, rollbackMetadata, partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs); + if (!wasSynced) { + // Since the instant-being-rolled-back was never committed to the metadata table, the files added there + // need not be deleted. For MOR Table, the rollback appends logBlocks so we need to keep the appended files. 
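+ // Hence only the deleted-files map is cleared below; the entries in partitionToAppendedFiles are retained.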
+ partitionToDeletedFiles.clear(); + } return convertFilesToRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Rollback"); } @@ -249,7 +193,6 @@ private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTi Map> partitionToDeletedFiles, Map> partitionToAppendedFiles, Option lastSyncTs) { - rollbackMetadata.getPartitionMetadata().values().forEach(pm -> { final String instantToRollback = rollbackMetadata.getCommitsRollback().get(0); // Has this rollback produced new files? @@ -285,7 +228,7 @@ private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTi } final String partition = pm.getPartitionPath(); - if (!pm.getSuccessDeleteFiles().isEmpty() && !shouldSkip) { + if ((!pm.getSuccessDeleteFiles().isEmpty() || !pm.getFailedDeleteFiles().isEmpty()) && !shouldSkip) { if (!partitionToDeletedFiles.containsKey(partition)) { partitionToDeletedFiles.put(partition, new ArrayList<>()); } @@ -293,6 +236,10 @@ private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTi // Extract deleted file name from the absolute paths saved in getSuccessDeleteFiles() List deletedFiles = pm.getSuccessDeleteFiles().stream().map(p -> new Path(p).getName()) .collect(Collectors.toList()); + if (!pm.getFailedDeleteFiles().isEmpty()) { + deletedFiles.addAll(pm.getFailedDeleteFiles().stream().map(p -> new Path(p).getName()) + .collect(Collectors.toList())); + } partitionToDeletedFiles.get(partition).addAll(deletedFiles); } @@ -364,4 +311,46 @@ private static List convertFilesToRecords(Map return records; } + + /** + * Maps a record key to a file group in the partition of interest. + * + * Note: For hashing, the algorithm is the same as String.hashCode(), but it is defined here because the hashCode() + * implementation is not guaranteed by the JVM to be consistent across JVM versions and implementations. + * + * @param recordKey record key for which the file group index is looked up. + * @param numFileGroups total number of file groups in the partition. + * @return The index of the file group to which the record key maps. + */ + public static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGroups) { + int h = 0; + for (int i = 0; i < recordKey.length(); ++i) { + h = 31 * h + recordKey.charAt(i); + } + + return Math.abs(Math.abs(h) % numFileGroups); + } + + /** + * Loads the latest file slices for all file groups in a partition of the Metadata Table. + * + * The returned list of file slices is sorted by file group name. + * @param metaClient instance of {@link HoodieTableMetaClient}. + * @param partition The name of the partition whose file groups are to be loaded. + * @return List of latest file slices for all file groups in a given partition. + */ + public static List loadPartitionFileGroupsWithLatestFileSlices(HoodieTableMetaClient metaClient, String partition) { + LOG.info("Loading file groups for metadata table partition " + partition); + + // If there are no commits on the metadata table then the table's default FileSystemView will not return any file + // slices even though we may have initialized them.
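+ // As a workaround, an in-memory delta-commit instant is added to the timeline handed to the file system view below + // so that the initialized file groups become visible; nothing is written to the metadata table's actual timeline.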
+ HoodieTimeline timeline = metaClient.getActiveTimeline(); + if (timeline.empty()) { + final HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.createNewInstantTime()); + timeline = new HoodieDefaultTimeline(Arrays.asList(instant).stream(), metaClient.getActiveTimeline()::getInstantDetails); + } + + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, timeline); + return fsView.getLatestFileSlices(partition).sorted((s1, s2) -> s1.getFileId().compareTo(s2.getFileId())) + .collect(Collectors.toList()); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java index 0436de707d2fd..380f4d04d34a6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java @@ -18,16 +18,31 @@ package org.apache.hudi.metadata; +import java.util.Arrays; +import java.util.List; + public enum MetadataPartitionType { - FILES("files"); + FILES("files", "files-"); + // refers to partition path in metadata table. private final String partitionPath; + // refers to fileId prefix used for all file groups in this partition. + private final String fileIdPrefix; - MetadataPartitionType(String partitionPath) { + MetadataPartitionType(String partitionPath, String fileIdPrefix) { this.partitionPath = partitionPath; + this.fileIdPrefix = fileIdPrefix; } public String partitionPath() { return partitionPath; } + + public String getFileIdPrefix() { + return fileIdPrefix; + } + + public static List all() { + return Arrays.asList(MetadataPartitionType.FILES.partitionPath()); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/TimelineMergedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/TimelineMergedTableMetadata.java deleted file mode 100644 index b2aca1f11cc8b..0000000000000 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/TimelineMergedTableMetadata.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.metadata; - -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieException; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; - -/** - * Provides functionality to convert timeline instants to table metadata records and then merge by key. Specify - * a filter to limit keys that are merged and stored in memory. - */ -public class TimelineMergedTableMetadata implements Serializable { - - private static final Logger LOG = LogManager.getLogger(TimelineMergedTableMetadata.class); - - HoodieTableMetaClient metaClient; - private List instants; - private Option lastSyncTs; - private Set mergeKeyFilter; - private HoodieActiveTimeline metadataTableTimeline; - - // keep it a simple hash map, so it can be easily passed onto the executors, once merged. - protected final Map> timelineMergedRecords; - - public TimelineMergedTableMetadata(HoodieTableMetaClient metaClient, HoodieActiveTimeline metadataTableTimeline, - List instants, Option lastSyncTs, Set mergeKeyFilter) { - this.metaClient = metaClient; - this.instants = instants; - this.lastSyncTs = lastSyncTs; - this.mergeKeyFilter = mergeKeyFilter != null ? mergeKeyFilter : Collections.emptySet(); - this.metadataTableTimeline = metadataTableTimeline; - this.timelineMergedRecords = new HashMap<>(); - - scan(); - } - - /** - * Converts instants in scanner to metadata table records and processes each record. - * - * @param - * @throws IOException - */ - private void scan() { - for (HoodieInstant instant : instants) { - try { - Option> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(metaClient, - metadataTableTimeline, instant, lastSyncTs); - if (records.isPresent()) { - records.get().forEach(record -> processNextRecord(record)); - } - } catch (Exception e) { - LOG.error(String.format("Got exception when processing timeline instant %s", instant.getTimestamp()), e); - throw new HoodieException(String.format("Got exception when processing timeline instant %s", instant.getTimestamp()), e); - } - } - } - - /** - * Process metadata table record by merging with existing record if it is a part of the key filter. - * - * @param hoodieRecord - */ - private void processNextRecord(HoodieRecord hoodieRecord) { - String key = hoodieRecord.getRecordKey(); - if (mergeKeyFilter.isEmpty() || mergeKeyFilter.contains(key)) { - if (timelineMergedRecords.containsKey(key)) { - // Merge and store the merged record - HoodieRecordPayload combinedValue = hoodieRecord.getData().preCombine(timelineMergedRecords.get(key).getData(), new Properties()); - timelineMergedRecords.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue)); - } else { - // Put the record as is - timelineMergedRecords.put(key, hoodieRecord); - } - } - } - - /** - * Retrieve merged hoodie record for given key. 
- * - * @param key of the record to retrieve - * @return {@code HoodieRecord} if key was found else {@code Option.empty()} - */ - public Option> getRecordByKey(String key) { - return Option.ofNullable((HoodieRecord) timelineMergedRecords.get(key)); - } - - /** - * Returns the timestamp of the latest synced instant. - */ - public Option getSyncedInstantTime() { - if (instants.isEmpty()) { - return Option.empty(); - } - - return Option.of(instants.get(instants.size() - 1).getTimestamp()); - } -} diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java index 6f5fe92158d72..b4304a4d5dce2 100644 --- a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java @@ -18,15 +18,17 @@ package org.apache.hudi.avro; -import org.apache.avro.JsonProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.exception.SchemaCompatibilityException; +import org.apache.avro.JsonProperties; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.junit.jupiter.api.Test; +import java.math.BigDecimal; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -81,6 +83,11 @@ public class TestHoodieAvroUtils { + "{\"name\": \"nullable_field\",\"type\": [\"null\" ,\"string\"],\"default\": null}," + "{\"name\": \"non_nullable_field_with_default\",\"type\": \"string\", \"default\": \"dummy\"}]}"; + private static String SCHEMA_WITH_DECIMAL_FIELD = "{\"type\":\"record\",\"name\":\"record\",\"fields\":[" + + "{\"name\":\"key_col\",\"type\":[\"null\",\"int\"],\"default\":null}," + + "{\"name\":\"decimal_col\",\"type\":[\"null\"," + + "{\"type\":\"bytes\",\"logicalType\":\"decimal\",\"precision\":8,\"scale\":4}],\"default\":null}]}"; + @Test public void testPropsPresent() { Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EXAMPLE_SCHEMA)); @@ -113,10 +120,10 @@ public void testDefaultValue() { rec.put("timestamp", 3.5); Schema schemaWithMetadata = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EVOLVED_SCHEMA)); GenericRecord rec1 = HoodieAvroUtils.rewriteRecord(rec, schemaWithMetadata); - assertEquals(rec1.get("new_col_not_nullable_default_dummy_val"), "dummy_val"); + assertEquals("dummy_val", rec1.get("new_col_not_nullable_default_dummy_val")); assertNull(rec1.get("new_col_nullable_wo_default")); assertNull(rec1.get("new_col_nullable_default_null")); - assertEquals(rec1.get("new_col_nullable_default_dummy_val"), "dummy_val"); + assertEquals("dummy_val", rec1.get("new_col_nullable_default_dummy_val")); assertNull(rec1.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); } @@ -128,7 +135,7 @@ public void testDefaultValueWithSchemaEvolution() { rec.put("pii_col", "val2"); rec.put("timestamp", 3.5); GenericRecord rec1 = HoodieAvroUtils.rewriteRecord(rec, new Schema.Parser().parse(EVOLVED_SCHEMA)); - assertEquals(rec1.get("new_col_not_nullable_default_dummy_val"), "dummy_val"); + assertEquals("dummy_val", rec1.get("new_col_not_nullable_default_dummy_val")); assertNull(rec1.get("new_col_nullable_wo_default")); } @@ -163,7 +170,7 @@ public void testNonNullableFieldWithDefault() { rec.put("pii_col", "val2"); rec.put("timestamp", 3.5); GenericRecord rec1 = HoodieAvroUtils.rewriteRecord(rec, new Schema.Parser().parse(SCHEMA_WITH_NON_NULLABLE_FIELD_WITH_DEFAULT)); - 
assertEquals(rec1.get("non_nullable_field_with_default"), "dummy"); + assertEquals("dummy", rec1.get("non_nullable_field_with_default")); } @Test @@ -206,9 +213,9 @@ public void testJsonNodeNullWithDefaultValues() { @Test public void testAddingAndRemovingMetadataFields() { Schema schemaWithMetaCols = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EXAMPLE_SCHEMA)); - assertEquals(schemaWithMetaCols.getFields().size(), NUM_FIELDS_IN_EXAMPLE_SCHEMA + HoodieRecord.HOODIE_META_COLUMNS.size()); + assertEquals(NUM_FIELDS_IN_EXAMPLE_SCHEMA + HoodieRecord.HOODIE_META_COLUMNS.size(), schemaWithMetaCols.getFields().size()); Schema schemaWithoutMetaCols = HoodieAvroUtils.removeMetadataFields(schemaWithMetaCols); - assertEquals(schemaWithoutMetaCols.getFields().size(), NUM_FIELDS_IN_EXAMPLE_SCHEMA); + assertEquals(NUM_FIELDS_IN_EXAMPLE_SCHEMA, schemaWithoutMetaCols.getFields().size()); } @Test @@ -219,7 +226,7 @@ public void testGetNestedFieldVal() { rec.put("pii_col", "val2"); Object rowKey = HoodieAvroUtils.getNestedFieldVal(rec, "_row_key", true); - assertEquals(rowKey, "key1"); + assertEquals("key1", rowKey); Object rowKeyNotExist = HoodieAvroUtils.getNestedFieldVal(rec, "fake_key", true); assertNull(rowKeyNotExist); @@ -240,4 +247,21 @@ public void testGetNestedFieldVal() { } } + @Test + public void testGetNestedFieldValWithDecimalFiled() { + GenericRecord rec = new GenericData.Record(new Schema.Parser().parse(SCHEMA_WITH_DECIMAL_FIELD)); + rec.put("key_col", "key"); + BigDecimal bigDecimal = new BigDecimal("1234.5678"); + ByteBuffer byteBuffer = ByteBuffer.wrap(bigDecimal.unscaledValue().toByteArray()); + rec.put("decimal_col", byteBuffer); + + Object decimalCol = HoodieAvroUtils.getNestedFieldVal(rec, "decimal_col", true); + assertEquals(bigDecimal, decimalCol); + + Object obj = rec.get(1); + assertTrue(obj instanceof ByteBuffer); + ByteBuffer buffer = (ByteBuffer) obj; + assertEquals(0, buffer.position()); + } + } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieMapPair.java b/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieMapPair.java new file mode 100644 index 0000000000000..86b1a213ba639 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieMapPair.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestHoodieMapPair { + + private static final String KEY1 = "key1"; + private static final String KEY2 = "key2"; + private static final String KEY3 = "key3"; + private static final String KEY4 = "key4"; + private static final String KEY5 = "key5"; + + private static final String STRING_VALUE1 = "value1"; + private static final String STRING_VALUE2 = "value2"; + private static final String STRING_VALUE3 = "value3"; + private static final String STRING_VALUE4 = "value4"; + private static final String STRING_VALUE5 = "value5"; + private static final String STRING_VALUE6 = "value6"; + + private static final int INTEGER_VALUE1 = 1; + private static final int INTEGER_VALUE2 = 2; + private static final int INTEGER_VALUE3 = 3; + private static final int INTEGER_VALUE4 = 4; + private static final int INTEGER_VALUE5 = 5; + + private static List> TEST_PAIRS; + private static HoodiePairData TEST_HOODIE_MAP_PAIR; + + @BeforeAll + public static void setup() { + TEST_PAIRS = constructPairs(); + TEST_HOODIE_MAP_PAIR = constructTestMapPairData(TEST_PAIRS); + } + + @Test + public void testKeys() { + assertHoodieDataEquals(Arrays.asList(KEY1, KEY2, KEY3, KEY4), TEST_HOODIE_MAP_PAIR.keys()); + } + + @Test + public void testValues() { + assertHoodieDataEquals(Arrays.asList( + STRING_VALUE1, STRING_VALUE2, STRING_VALUE3, STRING_VALUE4, STRING_VALUE5, STRING_VALUE6), + TEST_HOODIE_MAP_PAIR.values()); + } + + @Test + public void testCount() { + assertEquals(6, TEST_HOODIE_MAP_PAIR.count()); + } + + @Test + public void testCountByKey() { + Map expectedResultMap = new HashMap<>(); + expectedResultMap.put(KEY1, 2L); + expectedResultMap.put(KEY2, 2L); + expectedResultMap.put(KEY3, 1L); + expectedResultMap.put(KEY4, 1L); + + assertEquals(expectedResultMap, TEST_HOODIE_MAP_PAIR.countByKey()); + } + + @Test + public void testMap() { + assertHoodieDataEquals(Arrays.asList( + "key1,value1", "key1,value2", "key2,value3", "key2,value4", "key3,value5", "key4,value6"), + TEST_HOODIE_MAP_PAIR.map(pair -> pair.getKey() + "," + pair.getValue())); + } + + @Test + public void testMapToPair() { + Map> expectedResultMap = new HashMap<>(); + expectedResultMap.put("key10", Arrays.asList(1, 2)); + expectedResultMap.put("key20", Arrays.asList(3, 4)); + expectedResultMap.put("key30", Arrays.asList(5)); + expectedResultMap.put("key40", Arrays.asList(6)); + assertEquals(expectedResultMap, HoodieMapPair.getMapPair( + TEST_HOODIE_MAP_PAIR.mapToPair( + pair -> { + String value = pair.getValue(); + return new ImmutablePair<>(pair.getKey() + "0", + Integer.parseInt(String.valueOf(value.charAt(value.length() - 1)))); + }))); + } + + @Test + public void testLeftOuterJoinSingleValuePerKey() { + HoodiePairData pairData1 = constructTestMapPairData(Arrays.asList( + ImmutablePair.of(KEY1, STRING_VALUE1), + ImmutablePair.of(KEY2, STRING_VALUE2), + ImmutablePair.of(KEY3, STRING_VALUE3), + ImmutablePair.of(KEY4, STRING_VALUE4) + )); + + HoodiePairData pairData2 = constructTestMapPairData(Arrays.asList( 
+ ImmutablePair.of(KEY1, INTEGER_VALUE1), + ImmutablePair.of(KEY2, INTEGER_VALUE2), + ImmutablePair.of(KEY5, INTEGER_VALUE3) + )); + + Map>>> expectedResultMap = new HashMap<>(); + expectedResultMap.put(KEY1, Arrays.asList( + ImmutablePair.of(STRING_VALUE1, Option.of(INTEGER_VALUE1)))); + expectedResultMap.put(KEY2, Arrays.asList( + ImmutablePair.of(STRING_VALUE2, Option.of(INTEGER_VALUE2)))); + expectedResultMap.put(KEY3, Arrays.asList( + ImmutablePair.of(STRING_VALUE3, Option.empty()))); + expectedResultMap.put(KEY4, Arrays.asList( + ImmutablePair.of(STRING_VALUE4, Option.empty()))); + + assertEquals(expectedResultMap, + HoodieMapPair.getMapPair(pairData1.leftOuterJoin(pairData2))); + } + + @Test + public void testLeftOuterJoinMultipleValuesPerKey() { + HoodiePairData otherPairData = constructTestMapPairData(Arrays.asList( + ImmutablePair.of(KEY1, INTEGER_VALUE1), + ImmutablePair.of(KEY2, INTEGER_VALUE2), + ImmutablePair.of(KEY2, INTEGER_VALUE3), + ImmutablePair.of(KEY3, INTEGER_VALUE4), + ImmutablePair.of(KEY5, INTEGER_VALUE5) + )); + + Map>>> expectedResultMap = new HashMap<>(); + expectedResultMap.put(KEY1, Arrays.asList( + ImmutablePair.of(STRING_VALUE1, Option.of(INTEGER_VALUE1)), + ImmutablePair.of(STRING_VALUE2, Option.of(INTEGER_VALUE1)))); + expectedResultMap.put(KEY2, Arrays.asList( + ImmutablePair.of(STRING_VALUE3, Option.of(INTEGER_VALUE2)), + ImmutablePair.of(STRING_VALUE3, Option.of(INTEGER_VALUE3)), + ImmutablePair.of(STRING_VALUE4, Option.of(INTEGER_VALUE2)), + ImmutablePair.of(STRING_VALUE4, Option.of(INTEGER_VALUE3)))); + expectedResultMap.put(KEY3, Arrays.asList( + ImmutablePair.of(STRING_VALUE5, Option.of(INTEGER_VALUE4)))); + expectedResultMap.put(KEY4, Arrays.asList( + ImmutablePair.of(STRING_VALUE6, Option.empty()))); + + assertEquals(expectedResultMap, + HoodieMapPair.getMapPair(TEST_HOODIE_MAP_PAIR.leftOuterJoin(otherPairData))); + } + + private static List> constructPairs() { + return Arrays.asList( + ImmutablePair.of(KEY1, STRING_VALUE1), + ImmutablePair.of(KEY1, STRING_VALUE2), + ImmutablePair.of(KEY2, STRING_VALUE3), + ImmutablePair.of(KEY2, STRING_VALUE4), + ImmutablePair.of(KEY3, STRING_VALUE5), + ImmutablePair.of(KEY4, STRING_VALUE6) + ); + } + + private static HoodiePairData constructTestMapPairData( + final List> pairs) { + Map> map = new HashMap<>(); + addPairsToMap(map, pairs); + return HoodieMapPair.of(map); + } + + private static void addPairsToMap( + Map> map, final List> pairs) { + for (Pair pair : pairs) { + String key = pair.getKey(); + V value = pair.getValue(); + List list = map.computeIfAbsent(key, k -> new ArrayList<>()); + list.add(value); + } + } + + private void assertHoodieDataEquals( + List expectedList, HoodieData hoodieData) { + assertHoodieDataEquals(expectedList, hoodieData, Comparator.naturalOrder()); + } + + private void assertHoodieDataEquals( + List expectedList, HoodieData hoodieData, Comparator comparator) { + assertEquals(expectedList, + hoodieData.collectAsList().stream().sorted(comparator).collect(Collectors.toList()) + ); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java index c345cc7af5248..65c729e7aaed3 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.table.HoodieTableConfig; import 
org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.exception.HoodieException; @@ -50,7 +51,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.apache.hudi.common.table.timeline.HoodieActiveTimeline.COMMIT_FORMATTER; +import static org.apache.hudi.common.model.HoodieFileFormat.HOODIE_LOG; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNull; @@ -78,14 +79,14 @@ public void setUp() throws IOException { @Test public void testMakeDataFileName() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatInstantTime(new Date()); String fileName = UUID.randomUUID().toString(); assertEquals(FSUtils.makeDataFileName(instantTime, TEST_WRITE_TOKEN, fileName), fileName + "_" + TEST_WRITE_TOKEN + "_" + instantTime + BASE_FILE_EXTENSION); } @Test public void testMaskFileName() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatInstantTime(new Date()); int taskPartitionId = 2; assertEquals(FSUtils.maskWithoutFileId(instantTime, taskPartitionId), "*_" + taskPartitionId + "_" + instantTime + BASE_FILE_EXTENSION); } @@ -153,15 +154,18 @@ public void testProcessFiles() throws Exception { @Test public void testGetCommitTime() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatInstantTime(new Date()); String fileName = UUID.randomUUID().toString(); String fullFileName = FSUtils.makeDataFileName(instantTime, TEST_WRITE_TOKEN, fileName); assertEquals(instantTime, FSUtils.getCommitTime(fullFileName)); + // test log file name + fullFileName = FSUtils.makeLogFileName(fileName, HOODIE_LOG.getFileExtension(), instantTime, 1, TEST_WRITE_TOKEN); + assertEquals(instantTime, FSUtils.getCommitTime(fullFileName)); } @Test public void testGetFileNameWithoutMeta() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatInstantTime(new Date()); String fileName = UUID.randomUUID().toString(); String fullFileName = FSUtils.makeDataFileName(instantTime, TEST_WRITE_TOKEN, fileName); assertEquals(fileName, FSUtils.getFileId(fullFileName)); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java index a346a673a017b..186ac62d372b7 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java @@ -45,6 +45,7 @@ public void testStorageSchemes() { assertFalse(StorageSchemes.isAppendSupported("cosn")); assertFalse(StorageSchemes.isAppendSupported("dbfs")); assertFalse(StorageSchemes.isAppendSupported("cos")); + assertTrue(StorageSchemes.isAppendSupported("jfs")); assertFalse(StorageSchemes.isAppendSupported("bos")); assertFalse(StorageSchemes.isAppendSupported("ks3")); assertTrue(StorageSchemes.isAppendSupported("ofs")); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestDefaultHoodieRecordPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestDefaultHoodieRecordPayload.java index 
5be0961ca5351..87d4e746d81cb 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestDefaultHoodieRecordPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestDefaultHoodieRecordPayload.java @@ -77,8 +77,8 @@ public void testActiveRecords() throws IOException { assertEquals(payload1.preCombine(payload2, props), payload2); assertEquals(payload2.preCombine(payload1, props), payload2); - assertEquals(record1, payload1.getInsertValue(schema).get()); - assertEquals(record2, payload2.getInsertValue(schema).get()); + assertEquals(record1, payload1.getInsertValue(schema, props).get()); + assertEquals(record2, payload2.getInsertValue(schema, props).get()); assertEquals(payload1.combineAndGetUpdateValue(record2, schema, props).get(), record2); assertEquals(payload2.combineAndGetUpdateValue(record1, schema, props).get(), record2); @@ -103,8 +103,8 @@ public void testDeletedRecord() throws IOException { assertEquals(payload1.preCombine(payload2, props), payload2); assertEquals(payload2.preCombine(payload1, props), payload2); - assertEquals(record1, payload1.getInsertValue(schema).get()); - assertFalse(payload2.getInsertValue(schema).isPresent()); + assertEquals(record1, payload1.getInsertValue(schema, props).get()); + assertFalse(payload2.getInsertValue(schema, props).isPresent()); assertEquals(payload1.combineAndGetUpdateValue(delRecord1, schema, props).get(), delRecord1); assertFalse(payload2.combineAndGetUpdateValue(record1, schema, props).isPresent()); @@ -142,4 +142,20 @@ public void testGetEventTimeInMetadata(long eventTime) throws IOException { assertEquals(eventTime, Long.parseLong(payload2.getMetadata().get().get(DefaultHoodieRecordPayload.METADATA_EVENT_TIME_KEY))); } + + @ParameterizedTest + @ValueSource(longs = {1L, 1612542030000L}) + public void testGetEventTimeInMetadataForInserts(long eventTime) throws IOException { + GenericRecord record = new GenericData.Record(schema); + + record.put("id", "1"); + record.put("partition", "partition0"); + record.put("ts", eventTime); + record.put("_hoodie_is_deleted", false); + DefaultHoodieRecordPayload payload = new DefaultHoodieRecordPayload(record, eventTime); + payload.getInsertValue(schema, props); + assertTrue(payload.getMetadata().isPresent()); + assertEquals(eventTime, + Long.parseLong(payload.getMetadata().get().get(DefaultHoodieRecordPayload.METADATA_EVENT_TIME_KEY))); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java index 7136ce7d372bb..8fb9dddaa2e86 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java @@ -19,14 +19,13 @@ package org.apache.hudi.common.model; import org.apache.hudi.common.fs.FSUtils; - +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.Test; import java.util.Date; import java.util.UUID; -import static org.apache.hudi.common.table.timeline.HoodieActiveTimeline.COMMIT_FORMATTER; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -37,7 +36,7 @@ public class TestHoodieWriteStat { @Test public void testSetPaths() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatInstantTime(new Date()); String basePathString = 
"/data/tables/some-hoodie-table"; String partitionPathString = "2017/12/31"; String fileName = UUID.randomUUID().toString(); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java index a8c0321929563..c6eee05b87e6d 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.model; +import org.apache.avro.JsonProperties; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; @@ -126,4 +127,34 @@ public void testDeletedRecord() throws IOException { assertEquals(payload1.combineAndGetUpdateValue(delRecord1, schema).get(), record2); assertFalse(payload2.combineAndGetUpdateValue(record1, schema).isPresent()); } + + @Test + public void testNullColumn() throws IOException { + Schema avroSchema = Schema.createRecord(Arrays.asList( + new Schema.Field("id", Schema.createUnion(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.NULL)), "", JsonProperties.NULL_VALUE), + new Schema.Field("name", Schema.createUnion(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.NULL)), "", JsonProperties.NULL_VALUE), + new Schema.Field("age", Schema.createUnion(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.NULL)), "", JsonProperties.NULL_VALUE), + new Schema.Field("job", Schema.createUnion(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.NULL)), "", JsonProperties.NULL_VALUE) + )); + GenericRecord record1 = new GenericData.Record(avroSchema); + record1.put("id", "1"); + record1.put("name", "aa"); + record1.put("age", "1"); + record1.put("job", "1"); + + GenericRecord record2 = new GenericData.Record(avroSchema); + record2.put("id", "1"); + record2.put("name", "bb"); + record2.put("age", "2"); + record2.put("job", null); + + GenericRecord record3 = new GenericData.Record(avroSchema); + record3.put("id", "1"); + record3.put("name", "bb"); + record3.put("age", "2"); + record3.put("job", "1"); + + OverwriteNonDefaultsWithLatestAvroPayload payload2 = new OverwriteNonDefaultsWithLatestAvroPayload(record2, 1); + assertEquals(payload2.combineAndGetUpdateValue(record1, avroSchema).get(), record3); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java index 121e173c655c9..586a451065823 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java @@ -59,10 +59,10 @@ public void checkMetadata() { @Test public void checkSerDe() { // check if this object is serialized and de-serialized, we are able to read from the file system - HoodieTableMetaClient deseralizedMetaClient = + HoodieTableMetaClient deserializedMetaClient = HoodieTestUtils.serializeDeserialize(metaClient, HoodieTableMetaClient.class); - assertNotNull(deseralizedMetaClient); - HoodieActiveTimeline commitTimeline = deseralizedMetaClient.getActiveTimeline(); + assertNotNull(deserializedMetaClient); + HoodieActiveTimeline commitTimeline = deserializedMetaClient.getActiveTimeline(); HoodieInstant instant = new 
HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); commitTimeline.createNewInstant(instant); commitTimeline.saveAsComplete(instant, Option.of("test-detail".getBytes())); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java b/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java index 5c4c911e1576d..5f2d6928cbaaa 100755 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java @@ -25,7 +25,7 @@ import org.apache.hudi.common.testutils.MockHoodieTimeline; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; - +import org.apache.hudi.exception.HoodieException; import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -33,10 +33,15 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.function.BiConsumer; import java.util.function.Supplier; import java.util.stream.Collectors; @@ -428,6 +433,45 @@ public void testReplaceActionsTimeline() { assertEquals(HoodieTimeline.REPLACE_COMMIT_ACTION, validReplaceInstants.get(0).getAction()); } + @Test + public void testCreateNewInstantTime() throws Exception { + String lastInstantTime = HoodieActiveTimeline.createNewInstantTime(); + for (int i = 0; i < 3; ++i) { + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + assertTrue(HoodieTimeline.compareTimestamps(lastInstantTime, HoodieTimeline.LESSER_THAN, newInstantTime)); + lastInstantTime = newInstantTime; + } + + // All zero timestamp can be parsed + HoodieActiveTimeline.parseInstantTime("00000000000000"); + + // Multiple thread test + final int numChecks = 100000; + final int numThreads = 100; + final long milliSecondsInYear = 365 * 24 * 3600 * 1000; + ExecutorService executorService = Executors.newFixedThreadPool(numThreads); + List futures = new ArrayList<>(numThreads); + for (int idx = 0; idx < numThreads; ++idx) { + futures.add(executorService.submit(() -> { + Date date = new Date(System.currentTimeMillis() + (int)(Math.random() * numThreads) * milliSecondsInYear); + final String expectedFormat = HoodieActiveTimeline.formatInstantTime(date); + for (int tidx = 0; tidx < numChecks; ++tidx) { + final String curFormat = HoodieActiveTimeline.formatInstantTime(date); + if (!curFormat.equals(expectedFormat)) { + throw new HoodieException("Format error: expected=" + expectedFormat + ", curFormat=" + curFormat); + } + } + })); + } + + executorService.shutdown(); + assertTrue(executorService.awaitTermination(10, TimeUnit.SECONDS)); + // required to catch exceptions + for (Future f : futures) { + f.get(); + } + } + /** * Returns an exhaustive list of all possible HoodieInstant. 
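// ---- Editor's note (illustrative aside, not part of the diff) ----
// The concurrent check above calls HoodieActiveTimeline.formatInstantTime() from many
// threads and asserts the output never changes; that is exactly the failure mode guarded
// against when a shared java.text.SimpleDateFormat (the old COMMIT_FORMATTER style) is
// replaced, since SimpleDateFormat is not thread-safe while java.time.DateTimeFormatter is.
// The sketch below is a hypothetical stand-in for such a formatter, not the actual Hudi
// implementation; the 14-digit "yyyyMMddHHmmss" pattern matches the "00000000000000"
// instant parsed above. Side note: `365 * 24 * 3600 * 1000` in the test overflows int
// arithmetic before being widened to long; `365L * 24 * 3600 * 1000` would avoid that.
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Date;

final class InstantTimeFormatSketch {
  // Immutable and safe to share across threads without synchronization.
  private static final DateTimeFormatter INSTANT_TIME_FORMAT =
      DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone(ZoneId.systemDefault());

  static String formatInstantTime(Date date) {
    return INSTANT_TIME_FORMAT.format(date.toInstant());
  }
}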
* @return list of HoodieInstant diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java index dee15e22d9fa5..924c6724e7b22 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java @@ -108,7 +108,7 @@ public static Stream configParams() { @BeforeEach public void setup() throws IOException { - metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH); + metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH, false); basePath = metaClient.getBasePath(); refreshFsView(); } @@ -344,6 +344,11 @@ private void checkExternalFile(HoodieFileStatus srcFileStatus, Option protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingDataFile, boolean isCompactionInFlight, int expTotalFileSlices, int expTotalDataFiles, boolean includeInvalidAndInflight, boolean testBootstrap) throws Exception { + + if (testBootstrap) { + metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH, testBootstrap); + } + String partitionPath = "2016/05/01"; new File(basePath + "/" + partitionPath).mkdirs(); String fileId = UUID.randomUUID().toString(); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java index bb6c0b491a30c..491ad32f90dfb 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java @@ -23,6 +23,7 @@ import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -39,6 +40,8 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.IOException; import java.io.RandomAccessFile; @@ -48,17 +51,23 @@ import java.nio.file.Paths; import java.nio.file.attribute.FileTime; import java.time.Instant; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeCleanMetadata; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeCleanerPlan; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeCompactionPlan; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeRequestedReplaceMetadata; +import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeRestoreMetadata; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeRollbackMetadata; public class FileCreateUtils { + private static final Logger LOG = LogManager.getLogger(FileCreateUtils.class); + private static final String WRITE_TOKEN 
= "1-0-1"; private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); @@ -123,12 +132,25 @@ private static void deleteMetaFile(String basePath, String instantTime, String s } } + private static void deleteMetaFile(String basePath, String instantTime, String suffix) throws IOException { + Path parentPath = Paths.get(basePath, HoodieTableMetaClient.METAFOLDER_NAME); + Path metaFilePath = parentPath.resolve(instantTime + suffix); + if (Files.exists(metaFilePath)) { + Files.delete(metaFilePath); + } + } + public static void createCommit(String basePath, String instantTime) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION); } - public static void createCommit(String basePath, String instantTime, HoodieCommitMetadata metadata) throws IOException { - createMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION, metadata.toJsonString().getBytes(StandardCharsets.UTF_8)); + public static void createCommit(String basePath, String instantTime, Option metadata) throws IOException { + if (metadata.isPresent()) { + createMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION, + metadata.get().toJsonString().getBytes(StandardCharsets.UTF_8)); + } else { + createMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION); + } } public static void createCommit(String basePath, String instantTime, FileSystem fs) throws IOException { @@ -143,6 +165,10 @@ public static void createInflightCommit(String basePath, String instantTime) thr createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_COMMIT_EXTENSION); } + public static void createDeltaCommit(String basePath, String instantTime, HoodieCommitMetadata metadata) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION, metadata.toJsonString().getBytes(StandardCharsets.UTF_8)); + } + public static void createDeltaCommit(String basePath, String instantTime) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION); } @@ -159,6 +185,10 @@ public static void createInflightDeltaCommit(String basePath, String instantTime createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_DELTA_COMMIT_EXTENSION); } + public static void createInflightReplaceCommit(String basePath, String instantTime) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_REPLACE_COMMIT_EXTENSION); + } + public static void createReplaceCommit(String basePath, String instantTime, HoodieReplaceCommitMetadata metadata) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.REPLACE_COMMIT_EXTENSION, metadata.toJsonString().getBytes(StandardCharsets.UTF_8)); } @@ -203,6 +233,10 @@ public static void createRollbackFile(String basePath, String instantTime, Hoodi createMetaFile(basePath, instantTime, HoodieTimeline.ROLLBACK_EXTENSION, serializeRollbackMetadata(hoodieRollbackMetadata).get()); } + public static void createRestoreFile(String basePath, String instantTime, HoodieRestoreMetadata hoodieRestoreMetadata) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.RESTORE_ACTION, serializeRestoreMetadata(hoodieRestoreMetadata).get()); + } + private static void createAuxiliaryMetaFile(String basePath, String instantTime, String suffix) throws IOException { Path parentPath = Paths.get(basePath, HoodieTableMetaClient.AUXILIARYFOLDER_NAME); Files.createDirectories(parentPath); @@ -216,6 +250,10 @@ public static void 
createRequestedCompaction(String basePath, String instantTime createAuxiliaryMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_COMPACTION_EXTENSION); } + public static void createInflightCompaction(String basePath, String instantTime) throws IOException { + createAuxiliaryMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_COMPACTION_EXTENSION); + } + public static void createPartitionMetaFile(String basePath, String partitionPath) throws IOException { Path parentPath = Paths.get(basePath, partitionPath); Files.createDirectories(parentPath); @@ -298,6 +336,10 @@ public static void deleteDeltaCommit(String basePath, String instantTime) throws removeMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION); } + public static void deleteReplaceCommit(String basePath, String instantTime) throws IOException { + removeMetaFile(basePath, instantTime, HoodieTimeline.REPLACE_COMMIT_EXTENSION); + } + public static long getTotalMarkerFileCount(String basePath, String partitionPath, String instantTime, IOType ioType) throws IOException { Path parentPath = Paths.get(basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTime, partitionPath); if (Files.notExists(parentPath)) { @@ -307,6 +349,13 @@ public static long getTotalMarkerFileCount(String basePath, String partitionPath .endsWith(String.format("%s.%s", HoodieTableMetaClient.MARKER_EXTN, ioType))).count(); } + public static List getPartitionPaths(Path basePath) throws IOException { + if (Files.notExists(basePath)) { + return Collections.emptyList(); + } + return Files.list(basePath).filter(entry -> !entry.getFileName().toString().equals(HoodieTableMetaClient.METAFOLDER_NAME)).collect(Collectors.toList()); + } + /** * Find total basefiles for passed in paths. */ diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java index 76fdf18d4a820..95188bb0b68d9 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java @@ -75,7 +75,11 @@ public static void deleteFile(File fileToDelete) throws IOException { } public static List listRecursive(FileSystem fs, Path path) throws IOException { - RemoteIterator itr = fs.listFiles(path, true); + return listFiles(fs, path, true); + } + + public static List listFiles(FileSystem fs, Path path, boolean recursive) throws IOException { + RemoteIterator itr = fs.listFiles(path, recursive); List statuses = new ArrayList<>(); while (itr.hasNext()) { statuses.add(itr.next()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FixtureUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FixtureUtils.java new file mode 100644 index 0000000000000..6dfe0da797f8e --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FixtureUtils.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.testutils; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.URL; +import java.nio.file.Path; +import java.util.Objects; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +public final class FixtureUtils { + + public static Path prepareFixtureTable(URL fixtureResource, Path basePath) throws IOException { + File zippedFixtureTable = new File(fixtureResource.getFile()); + try (ZipInputStream zis = new ZipInputStream(new FileInputStream(zippedFixtureTable))) { + byte[] buffer = new byte[1024]; + ZipEntry zipEntry = zis.getNextEntry(); + Path tableBasePath = basePath.resolve(Objects.requireNonNull(zipEntry).getName() + .replaceAll(File.separator + "$", "")); + while (zipEntry != null) { + File newFile = newFile(basePath.toFile(), zipEntry); + if (zipEntry.isDirectory()) { + if (!newFile.isDirectory() && !newFile.mkdirs()) { + throw new IOException("Failed to create directory " + newFile); + } + } else { + // fix for Windows-created archives + File parent = newFile.getParentFile(); + if (!parent.isDirectory() && !parent.mkdirs()) { + throw new IOException("Failed to create directory " + parent); + } + + // write file content + try (FileOutputStream fos = new FileOutputStream(newFile)) { + int len; + while ((len = zis.read(buffer)) > 0) { + fos.write(buffer, 0, len); + } + } + } + zipEntry = zis.getNextEntry(); + } + zis.closeEntry(); + return tableBasePath; + } + } + + public static File newFile(File destinationDir, ZipEntry zipEntry) throws IOException { + File destFile = new File(destinationDir, zipEntry.getName()); + + String destDirPath = destinationDir.getCanonicalPath(); + String destFilePath = destFile.getCanonicalPath(); + + if (!destFilePath.startsWith(destDirPath + File.separator)) { + throw new IOException("Entry is outside of the target dir: " + zipEntry.getName()); + } + + return destFile; + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java index 973881696154b..311c131d432c6 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java @@ -34,12 +34,17 @@ */ public class HoodieCommonTestHarness { + protected String tableName = null; protected String basePath = null; protected transient HoodieTestDataGenerator dataGen = null; protected transient HoodieTableMetaClient metaClient; @TempDir public java.nio.file.Path tempDir; + protected void setTableName(String tableName) { + this.tableName = tableName; + } + /** * Initializes basePath. 
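// ---- Editor's note (illustrative aside, not part of the diff) ----
// Hypothetical usage of the new FixtureUtils.prepareFixtureTable(): unpack a zipped
// fixture table shipped as a test resource into a JUnit temp dir and use the returned
// extracted base path. The resource name "fixtures/sample-table.zip" is an assumption
// for illustration only.
import org.apache.hudi.common.testutils.FixtureUtils;

import java.io.IOException;
import java.net.URL;
import java.nio.file.Path;

class FixtureUsageSketch {
  Path prepareSampleTable(Path tempDir) throws IOException {
    URL zippedTable = getClass().getClassLoader().getResource("fixtures/sample-table.zip");
    // Extracts the archive under tempDir and returns the table's base path
    // (the archive's top-level directory), rejecting zip-slip entries via newFile().
    return FixtureUtils.prepareFixtureTable(zippedTable, tempDir);
  }
}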
*/ diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java index 68d1f2dd32c2c..e988c9df618cc 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java @@ -31,6 +31,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.AvroOrcUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; @@ -47,6 +48,7 @@ import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.orc.TypeDescription; import java.io.IOException; import java.io.Serializable; @@ -129,10 +131,12 @@ public class HoodieTestDataGenerator { public static final Schema AVRO_SCHEMA = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); + public static final TypeDescription ORC_SCHEMA = AvroOrcUtils.createOrcSchema(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); public static final Schema AVRO_SCHEMA_WITH_METADATA_FIELDS = HoodieAvroUtils.addMetadataFields(AVRO_SCHEMA); public static final Schema AVRO_SHORT_TRIP_SCHEMA = new Schema.Parser().parse(SHORT_TRIP_SCHEMA); public static final Schema AVRO_TRIP_SCHEMA = new Schema.Parser().parse(TRIP_SCHEMA); + public static final TypeDescription ORC_TRIP_SCHEMA = AvroOrcUtils.createOrcSchema(new Schema.Parser().parse(TRIP_SCHEMA)); public static final Schema FLATTENED_AVRO_SCHEMA = new Schema.Parser().parse(TRIP_FLATTENED_SCHEMA); private static final Random RAND = new Random(46474747); @@ -156,6 +160,7 @@ public HoodieTestDataGenerator(String[] partitionPaths, Map(); existingKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap); numKeysBySchema = new HashMap<>(); + numKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap.size()); } /** @@ -840,8 +845,8 @@ public int getNumExistingKeys(String schemaStr) { public static class KeyPartition implements Serializable { - HoodieKey key; - String partitionPath; + public HoodieKey key; + public String partitionPath; } public void close() { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java index e6c488e497737..95d0657cb2082 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java @@ -19,52 +19,81 @@ package org.apache.hudi.common.testutils; +import org.apache.hudi.avro.model.HoodieActionInstant; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieInstantInfo; import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.avro.model.HoodieSavepointMetadata; +import org.apache.hudi.avro.model.HoodieSavepointPartitionMetadata; +import org.apache.hudi.common.HoodieCleanStat; +import org.apache.hudi.common.fs.FSUtils; 
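// ---- Editor's note (illustrative aside, not part of the diff) ----
// The new ORC_SCHEMA / ORC_TRIP_SCHEMA constants are derived from the Avro trip schemas
// via AvroOrcUtils.createOrcSchema(). For orientation, an ORC TypeDescription for a tiny
// two-field record built by hand looks like this; the field names are illustrative and
// not the actual trip schema.
import org.apache.orc.TypeDescription;

final class OrcSchemaSketch {
  static TypeDescription tinySchema() {
    return TypeDescription.createStruct()
        .addField("_row_key", TypeDescription.createString())
        .addField("timestamp", TypeDescription.createLong());
  }
}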
import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler; import org.apache.hudi.common.util.CompactionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Paths; import java.time.Instant; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Random; import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; import static java.time.temporal.ChronoUnit.SECONDS; -import static org.apache.hudi.common.table.timeline.HoodieActiveTimeline.COMMIT_FORMATTER; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.apache.hudi.common.model.WriteOperationType.CLUSTER; +import static org.apache.hudi.common.model.WriteOperationType.COMPACT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.CLEAN_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION; import static org.apache.hudi.common.testutils.FileCreateUtils.baseFileName; import static org.apache.hudi.common.testutils.FileCreateUtils.createCleanFile; import static org.apache.hudi.common.testutils.FileCreateUtils.createCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.createDeltaCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightCleanFile; import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightCommit; +import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightCompaction; import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightDeltaCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightReplaceCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightRollbackFile; @@ -75,11 +104,22 @@ import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedCompaction; import static 
org.apache.hudi.common.testutils.FileCreateUtils.createRequestedDeltaCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedReplaceCommit; +import static org.apache.hudi.common.testutils.FileCreateUtils.createRestoreFile; import static org.apache.hudi.common.testutils.FileCreateUtils.createRollbackFile; import static org.apache.hudi.common.testutils.FileCreateUtils.logFileName; +import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata; +import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; +import static org.apache.hudi.common.util.CommitUtils.buildMetadata; +import static org.apache.hudi.common.util.CommitUtils.getCommitActionType; +import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING; public class HoodieTestTable { + private static final Logger LOG = LogManager.getLogger(HoodieTestTable.class); + private static final Random RANDOM = new Random(); + protected static HoodieTestTableState testTableState; + private final List inflightCommits = new ArrayList<>(); + protected final String basePath; protected final FileSystem fs; protected HoodieTableMetaClient metaClient; @@ -94,6 +134,7 @@ protected HoodieTestTable(String basePath, FileSystem fs, HoodieTableMetaClient } public static HoodieTestTable of(HoodieTableMetaClient metaClient) { + testTableState = HoodieTestTableState.of(); return new HoodieTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient); } @@ -106,7 +147,7 @@ public static String makeNewCommitTime() { } public static String makeNewCommitTime(Instant dateTime) { - return COMMIT_FORMATTER.format(Date.from(dateTime)); + return HoodieActiveTimeline.formatInstantTime(Date.from(dateTime)); } public static List makeIncrementalCommitTimes(int num) { @@ -114,42 +155,82 @@ public static List makeIncrementalCommitTimes(int num) { } public static List makeIncrementalCommitTimes(int num, int firstOffsetSeconds) { + return makeIncrementalCommitTimes(num, firstOffsetSeconds, 0); + } + + public static List makeIncrementalCommitTimes(int num, int firstOffsetSeconds, int deltaSecs) { final Instant now = Instant.now(); return IntStream.range(0, num) - .mapToObj(i -> makeNewCommitTime(now.plus(firstOffsetSeconds + i, SECONDS))) + .mapToObj(i -> makeNewCommitTime(now.plus(deltaSecs == 0 ? (firstOffsetSeconds + i) : (i == 0 ? 
(firstOffsetSeconds) : (i * deltaSecs) + i), SECONDS))) .collect(Collectors.toList()); } public HoodieTestTable addRequestedCommit(String instantTime) throws Exception { createRequestedCommit(basePath, instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } public HoodieTestTable addInflightCommit(String instantTime) throws Exception { createRequestedCommit(basePath, instantTime); createInflightCommit(basePath, instantTime); + inflightCommits.add(instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } - public HoodieTestTable addCommit(String instantTime) throws Exception { - createRequestedCommit(basePath, instantTime); - createInflightCommit(basePath, instantTime); - createCommit(basePath, instantTime); + public HoodieTestTable addInflightDeltaCommit(String instantTime) throws Exception { + createRequestedDeltaCommit(basePath, instantTime); + createInflightDeltaCommit(basePath, instantTime); + inflightCommits.add(instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } - public HoodieTestTable addCommit(String instantTime, HoodieCommitMetadata metadata) throws Exception { + public HoodieTestTable addCommit(String instantTime) throws Exception { + return addCommit(instantTime, Option.empty()); + } + + public HoodieTestTable addCommit(String instantTime, Option metadata) throws Exception { createRequestedCommit(basePath, instantTime); createInflightCommit(basePath, instantTime); createCommit(basePath, instantTime, metadata); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); + return this; + } + + public HoodieCommitMetadata createCommitMetadata(WriteOperationType operationType, String commitTime, + HoodieTestTableState testTableState) { + String actionType = getCommitActionType(operationType, metaClient.getTableType()); + return createCommitMetadata(operationType, commitTime, Collections.emptyMap(), testTableState, false, actionType); + } + + public HoodieCommitMetadata createCommitMetadata(WriteOperationType operationType, String commitTime, + HoodieTestTableState testTableState, boolean bootstrap) { + String actionType = getCommitActionType(operationType, metaClient.getTableType()); + return createCommitMetadata(operationType, commitTime, Collections.emptyMap(), testTableState, bootstrap, + actionType); + } + + public HoodieCommitMetadata createCommitMetadata(WriteOperationType operationType, String commitTime, + Map> partitionToReplaceFileIds, + HoodieTestTableState testTableState, boolean bootstrap, String action) { + List writeStats = generateHoodieWriteStatForPartition(testTableState.getPartitionToBaseFileInfoMap(commitTime), commitTime, bootstrap); + if (MERGE_ON_READ.equals(metaClient.getTableType()) && UPSERT.equals(operationType)) { + writeStats.addAll(generateHoodieWriteStatForPartitionLogFiles(testTableState.getPartitionToLogFileInfoMap(commitTime), commitTime, bootstrap)); + } + Map extraMetadata = createImmutableMap("test", "test"); + return buildMetadata(writeStats, partitionToReplaceFileIds, Option.of(extraMetadata), operationType, EMPTY_STRING, action); + } + + public HoodieTestTable moveInflightCommitToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException { + if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { + createCommit(basePath, instantTime, Option.of(metadata)); + } else { + 
createDeltaCommit(basePath, instantTime, metadata); + } + inflightCommits.remove(instantTime); + currentInstantTime = instantTime; return this; } @@ -158,7 +239,14 @@ public HoodieTestTable addDeltaCommit(String instantTime) throws Exception { createInflightDeltaCommit(basePath, instantTime); createDeltaCommit(basePath, instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); + return this; + } + + public HoodieTestTable addDeltaCommit(String instantTime, HoodieCommitMetadata metadata) throws Exception { + createRequestedDeltaCommit(basePath, instantTime); + createInflightDeltaCommit(basePath, instantTime); + createDeltaCommit(basePath, instantTime, metadata); + currentInstantTime = instantTime; return this; } @@ -171,14 +259,12 @@ public HoodieTestTable addReplaceCommit( createInflightReplaceCommit(basePath, instantTime, inflightReplaceMetadata); createReplaceCommit(basePath, instantTime, completeReplaceMetadata); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } public HoodieTestTable addRequestedReplace(String instantTime, Option requestedReplaceMetadata) throws Exception { createRequestedReplaceCommit(basePath, instantTime, requestedReplaceMetadata); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } @@ -186,7 +272,6 @@ public HoodieTestTable addInflightClean(String instantTime, HoodieCleanerPlan cl createRequestedCleanFile(basePath, instantTime, cleanerPlan); createInflightCleanFile(basePath, instantTime, cleanerPlan); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } @@ -195,14 +280,37 @@ public HoodieTestTable addClean(String instantTime, HoodieCleanerPlan cleanerPla createInflightCleanFile(basePath, instantTime, cleanerPlan); createCleanFile(basePath, instantTime, metadata); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } + public HoodieTestTable addClean(String instantTime) throws IOException { + HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant(EMPTY_STRING, EMPTY_STRING, EMPTY_STRING), EMPTY_STRING, new HashMap<>(), + CleanPlanV2MigrationHandler.VERSION, new HashMap<>()); + HoodieCleanStat cleanStats = new HoodieCleanStat( + HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, + HoodieTestUtils.DEFAULT_PARTITION_PATHS[RANDOM.nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)], + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList(), + instantTime); + HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats)); + return HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata); + } + + public Pair getHoodieCleanMetadata(String commitTime, HoodieTestTableState testTableState) { + HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant(commitTime, CLEAN_ACTION, EMPTY_STRING), EMPTY_STRING, new HashMap<>(), + CleanPlanV2MigrationHandler.VERSION, new HashMap<>()); + List cleanStats = new ArrayList<>(); + for (Map.Entry> entry : testTableState.getPartitionToFileIdMapForCleaner(commitTime).entrySet()) { + cleanStats.add(new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, + entry.getKey(), entry.getValue(), entry.getValue(), Collections.emptyList(), commitTime)); + } + return Pair.of(cleanerPlan, convertCleanMetadata(commitTime, Option.of(0L), 
cleanStats)); + } + public HoodieTestTable addInflightRollback(String instantTime) throws IOException { createInflightRollbackFile(basePath, instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } @@ -210,14 +318,77 @@ public HoodieTestTable addRollback(String instantTime, HoodieRollbackMetadata ro createInflightRollbackFile(basePath, instantTime); createRollbackFile(basePath, instantTime, rollbackMetadata); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } + public HoodieTestTable addRestore(String instantTime, HoodieRestoreMetadata restoreMetadata) throws IOException { + createRestoreFile(basePath, instantTime, restoreMetadata); + currentInstantTime = instantTime; + return this; + } + + public HoodieRollbackMetadata getRollbackMetadata(String instantTimeToDelete, Map> partitionToFilesMeta) throws Exception { + HoodieRollbackMetadata rollbackMetadata = new HoodieRollbackMetadata(); + rollbackMetadata.setCommitsRollback(Collections.singletonList(instantTimeToDelete)); + rollbackMetadata.setStartRollbackTime(instantTimeToDelete); + Map partitionMetadataMap = new HashMap<>(); + for (Map.Entry> entry : partitionToFilesMeta.entrySet()) { + HoodieRollbackPartitionMetadata rollbackPartitionMetadata = new HoodieRollbackPartitionMetadata(); + rollbackPartitionMetadata.setPartitionPath(entry.getKey()); + rollbackPartitionMetadata.setSuccessDeleteFiles(entry.getValue()); + rollbackPartitionMetadata.setFailedDeleteFiles(new ArrayList<>()); + rollbackPartitionMetadata.setWrittenLogFiles(getWrittenLogFiles(instantTimeToDelete, entry)); + long rollbackLogFileSize = 50 + RANDOM.nextInt(500); + String fileId = UUID.randomUUID().toString(); + String logFileName = logFileName(instantTimeToDelete, fileId, 0); + FileCreateUtils.createLogFile(basePath, entry.getKey(), instantTimeToDelete, fileId, 0, (int) rollbackLogFileSize); + rollbackPartitionMetadata.setRollbackLogFiles(createImmutableMap(logFileName, rollbackLogFileSize)); + partitionMetadataMap.put(entry.getKey(), rollbackPartitionMetadata); + } + rollbackMetadata.setPartitionMetadata(partitionMetadataMap); + rollbackMetadata.setInstantsRollback(Collections.singletonList(new HoodieInstantInfo(instantTimeToDelete, HoodieTimeline.ROLLBACK_ACTION))); + return rollbackMetadata; + } + + /** + * Return a map of log file name to file size that were expected to be rolled back in that partition. 
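// ---- Editor's note (illustrative aside, not part of the diff) ----
// Hypothetical use of the rollback helpers above when composing a synthetic timeline in a
// test. It assumes a `testTable` obtained from HoodieTestTable.of(metaClient) and the
// static import of FileCreateUtils.baseFileName shown earlier; the instant times,
// partition path, and file id are made up for illustration. Note that getRollbackMetadata()
// also writes a rollback log file per partition, as its implementation above shows.
Map<String, List<String>> partitionToFiles = new HashMap<>();
partitionToFiles.put("2016/03/15",
    Collections.singletonList(baseFileName("002", UUID.randomUUID().toString())));
HoodieRollbackMetadata rollbackMetadata = testTable.getRollbackMetadata("002", partitionToFiles);
testTable.addRollback("003", rollbackMetadata);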
+ */ + private Map getWrittenLogFiles(String instant, Map.Entry> entry) { + Map writtenLogFiles = new HashMap<>(); + for (String fileName : entry.getValue()) { + if (FSUtils.isLogFile(new Path(fileName))) { + if (testTableState.getPartitionToLogFileInfoMap(instant) != null + && testTableState.getPartitionToLogFileInfoMap(instant).containsKey(entry.getKey())) { + List> fileInfos = testTableState.getPartitionToLogFileInfoMap(instant).get(entry.getKey()); + for (Pair fileInfo : fileInfos) { + if (fileName.equals(logFileName(instant, fileInfo.getLeft(), fileInfo.getRight()[0]))) { + writtenLogFiles.put(fileName, Long.valueOf(fileInfo.getRight()[1])); + } + } + } + } + } + return writtenLogFiles; + } + + public HoodieSavepointMetadata getSavepointMetadata(String instant, Map> partitionToFilesMeta) { + HoodieSavepointMetadata savepointMetadata = new HoodieSavepointMetadata(); + savepointMetadata.setSavepointedAt(Long.valueOf(instant)); + Map partitionMetadataMap = new HashMap<>(); + for (Map.Entry> entry : partitionToFilesMeta.entrySet()) { + HoodieSavepointPartitionMetadata savepointPartitionMetadata = new HoodieSavepointPartitionMetadata(); + savepointPartitionMetadata.setPartitionPath(entry.getKey()); + savepointPartitionMetadata.setSavepointDataFile(entry.getValue()); + partitionMetadataMap.put(entry.getKey(), savepointPartitionMetadata); + } + savepointMetadata.setPartitionMetadata(partitionMetadataMap); + savepointMetadata.setSavepointedBy("test"); + return savepointMetadata; + } + public HoodieTestTable addRequestedCompaction(String instantTime) throws IOException { createRequestedCompaction(basePath, instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } @@ -235,6 +406,33 @@ public HoodieTestTable addRequestedCompaction(String instantTime, FileSlice... f return addRequestedCompaction(instantTime, plan); } + public HoodieTestTable addInflightCompaction(String instantTime, HoodieCommitMetadata commitMetadata) throws Exception { + List fileSlices = new ArrayList<>(); + for (Map.Entry> entry : commitMetadata.getPartitionToWriteStats().entrySet()) { + for (HoodieWriteStat stat: entry.getValue()) { + fileSlices.add(new FileSlice(entry.getKey(), instantTime, stat.getPath())); + } + } + this.addRequestedCompaction(instantTime, fileSlices.toArray(new FileSlice[0])); + createInflightCompaction(basePath, instantTime); + inflightCommits.add(instantTime); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addCompaction(String instantTime, HoodieCommitMetadata commitMetadata) throws Exception { + createRequestedCompaction(basePath, instantTime); + createInflightCompaction(basePath, instantTime); + return addCommit(instantTime, Option.of(commitMetadata)); + } + + public HoodieTestTable moveInflightCompactionToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException { + createCommit(basePath, instantTime, Option.of(metadata)); + inflightCommits.remove(instantTime); + currentInstantTime = instantTime; + return this; + } + public HoodieTestTable forCommit(String instantTime) { currentInstantTime = instantTime; return this; @@ -311,6 +509,13 @@ public HoodieTestTable withBaseFilesInPartition(String partition, int... 
lengths return this; } + public HoodieTestTable withBaseFilesInPartition(String partition, List> fileInfos) throws Exception { + for (Pair fileInfo : fileInfos) { + FileCreateUtils.createBaseFile(basePath, partition, currentInstantTime, fileInfo.getKey(), fileInfo.getValue()); + } + return this; + } + public String getFileIdWithLogFile(String partitionPath) throws Exception { String fileId = UUID.randomUUID().toString(); withLogFile(partitionPath, fileId); @@ -328,6 +533,13 @@ public HoodieTestTable withLogFile(String partitionPath, String fileId, int... v return this; } + public HoodieTestTable withLogFilesInPartition(String partition, List> fileInfos) throws Exception { + for (Pair fileInfo : fileInfos) { + FileCreateUtils.createLogFile(basePath, partition, currentInstantTime, fileInfo.getKey(), fileInfo.getValue()[0], fileInfo.getValue()[1]); + } + return this; + } + public boolean inflightCommitExists(String instantTime) { try { return fs.exists(getInflightCommitFilePath(instantTime)); @@ -388,6 +600,11 @@ public Path getPartitionPath(String partition) { return new Path(Paths.get(basePath, partition).toUri()); } + public List getAllPartitionPaths() throws IOException { + java.nio.file.Path basePathPath = Paths.get(basePath, HoodieTableMetaClient.TEMPFOLDER_NAME).getParent().getParent(); + return FileCreateUtils.getPartitionPaths(basePathPath); + } + public Path getBaseFilePath(String partition, String fileId) { return new Path(Paths.get(basePath, partition, getBaseFileNameById(fileId)).toUri()); } @@ -396,6 +613,24 @@ public String getBaseFileNameById(String fileId) { return baseFileName(currentInstantTime, fileId); } + public Path getLogFilePath(String partition, String fileId, int version) { + return new Path(Paths.get(basePath, partition, getLogFileNameById(fileId, version)).toString()); + } + + public String getLogFileNameById(String fileId, int version) { + return logFileName(currentInstantTime, fileId, version); + } + + public List getEarliestFilesInPartition(String partition, int count) throws IOException { + List fileStatuses = Arrays.asList(listAllFilesInPartition(partition)); + fileStatuses.sort(Comparator.comparing(FileStatus::getModificationTime)); + return fileStatuses.subList(0, count).stream().map(entry -> entry.getPath().getName()).collect(Collectors.toList()); + } + + public List inflightCommits() { + return this.inflightCommits; + } + public FileStatus[] listAllBaseFiles() throws IOException { return listAllBaseFiles(HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension()); } @@ -421,16 +656,502 @@ public FileStatus[] listAllBaseAndLogFiles() throws IOException { } public FileStatus[] listAllFilesInPartition(String partitionPath) throws IOException { - return FileSystemTestUtils.listRecursive(fs, new Path(Paths.get(basePath, partitionPath).toString())).toArray(new FileStatus[0]); + return FileSystemTestUtils.listRecursive(fs, new Path(Paths.get(basePath, partitionPath).toString())).stream() + .filter(entry -> { + boolean toReturn = true; + String fileName = entry.getPath().getName(); + if (fileName.equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)) { + toReturn = false; + } else { + for (String inflight : inflightCommits) { + if (fileName.contains(inflight)) { + toReturn = false; + break; + } + } + } + return toReturn; + }).toArray(FileStatus[]::new); } public FileStatus[] listAllFilesInTempFolder() throws IOException { return FileSystemTestUtils.listRecursive(fs, new Path(Paths.get(basePath, 
HoodieTableMetaClient.TEMPFOLDER_NAME).toString())).toArray(new FileStatus[0]); } + public void deleteFilesInPartition(String partitionPath, List filesToDelete) throws IOException { + FileStatus[] allFiles = listAllFilesInPartition(partitionPath); + Arrays.stream(allFiles).filter(entry -> filesToDelete.contains(entry.getPath().getName())).forEach(entry -> { + try { + Files.delete(Paths.get(basePath, partitionPath, entry.getPath().getName())); + } catch (IOException e) { + throw new HoodieTestTableException(e); + } + }); + } + + public HoodieTestTable doRollback(String commitTimeToRollback, String commitTime) throws Exception { + metaClient = HoodieTableMetaClient.reload(metaClient); + Option commitMetadata = getMetadataForInstant(commitTimeToRollback); + if (!commitMetadata.isPresent()) { + throw new IllegalArgumentException("Instant to rollback not present in timeline: " + commitTimeToRollback); + } + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + HoodieRollbackMetadata rollbackMetadata = getRollbackMetadata(commitTimeToRollback, partitionFiles); + for (Map.Entry> entry : partitionFiles.entrySet()) { + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + return addRollback(commitTime, rollbackMetadata); + } + + public HoodieTestTable doRestore(String commitToRestoreTo, String restoreTime) throws Exception { + metaClient = HoodieTableMetaClient.reload(metaClient); + List commitsToRollback = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants().findInstantsAfter(commitToRestoreTo).getReverseOrderedInstants().collect(Collectors.toList()); + Map> rollbackMetadataMap = new HashMap<>(); + for (HoodieInstant commitInstantToRollback: commitsToRollback) { + Option commitMetadata = getCommitMeta(commitInstantToRollback); + if (!commitMetadata.isPresent()) { + throw new IllegalArgumentException("Instant to rollback not present in timeline: " + commitInstantToRollback.getTimestamp()); + } + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + rollbackMetadataMap.put(commitInstantToRollback.getTimestamp(), + Collections.singletonList(getRollbackMetadata(commitInstantToRollback.getTimestamp(), partitionFiles))); + for (Map.Entry> entry : partitionFiles.entrySet()) { + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + } + + HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.convertRestoreMetadata(restoreTime,1000L, + commitsToRollback, rollbackMetadataMap); + return addRestore(restoreTime, restoreMetadata); + } + + public HoodieReplaceCommitMetadata doCluster(String commitTime, Map> partitionToReplaceFileIds, List partitions, int filesPerPartition) throws Exception { + HoodieTestTableState testTableState = getTestTableStateWithPartitionFileInfo(CLUSTER, metaClient.getTableType(), commitTime, partitions, filesPerPartition); + this.currentInstantTime = commitTime; + Map>> partitionToReplaceFileIdsWithLength = new HashMap<>(); + for (Map.Entry> entry : partitionToReplaceFileIds.entrySet()) { + String partition = entry.getKey(); + partitionToReplaceFileIdsWithLength.put(entry.getKey(), new ArrayList<>()); + for (String fileId : entry.getValue()) { + int length = 100 + RANDOM.nextInt(500); + partitionToReplaceFileIdsWithLength.get(partition).add(Pair.of(fileId, length)); + } + } + List writeStats = generateHoodieWriteStatForPartition(testTableState.getPartitionToBaseFileInfoMap(commitTime), commitTime, false); + for (String partition : testTableState.getPartitionToBaseFileInfoMap(commitTime).keySet()) { + 
this.withBaseFilesInPartition(partition, testTableState.getPartitionToBaseFileInfoMap(commitTime).get(partition)); + } + HoodieReplaceCommitMetadata replaceMetadata = + (HoodieReplaceCommitMetadata) buildMetadata(writeStats, partitionToReplaceFileIds, Option.empty(), CLUSTER, EMPTY_STRING, + REPLACE_COMMIT_ACTION); + addReplaceCommit(commitTime, Option.empty(), Option.empty(), replaceMetadata); + return replaceMetadata; + } + + public HoodieCleanMetadata doClean(String commitTime, Map partitionFileCountsToDelete) throws IOException { + Map> partitionFilesToDelete = new HashMap<>(); + for (Map.Entry entry : partitionFileCountsToDelete.entrySet()) { + partitionFilesToDelete.put(entry.getKey(), getEarliestFilesInPartition(entry.getKey(), entry.getValue())); + } + HoodieTestTableState testTableState = new HoodieTestTableState(); + for (Map.Entry> entry : partitionFilesToDelete.entrySet()) { + testTableState = testTableState.createTestTableStateForCleaner(commitTime, entry.getKey(), entry.getValue()); + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + Pair cleanerMeta = getHoodieCleanMetadata(commitTime, testTableState); + addClean(commitTime, cleanerMeta.getKey(), cleanerMeta.getValue()); + return cleanerMeta.getValue(); + } + + public HoodieCleanMetadata doCleanBasedOnCommits(String cleanCommitTime, List commitsToClean) throws IOException { + Map partitionFileCountsToDelete = new HashMap<>(); + for (String commitTime : commitsToClean) { + Option commitMetadata = getMetadataForInstant(commitTime); + if (commitMetadata.isPresent()) { + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + for (String partition : partitionFiles.keySet()) { + partitionFileCountsToDelete.put(partition, partitionFiles.get(partition).size() + partitionFileCountsToDelete.getOrDefault(partition, 0)); + } + } + } + return doClean(cleanCommitTime, partitionFileCountsToDelete); + } + + public HoodieSavepointMetadata doSavepoint(String commitTime) throws IOException { + Option commitMetadata = getMetadataForInstant(commitTime); + if (!commitMetadata.isPresent()) { + throw new IllegalArgumentException("Instant to rollback not present in timeline: " + commitTime); + } + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + HoodieSavepointMetadata savepointMetadata = getSavepointMetadata(commitTime, partitionFiles); + for (Map.Entry> entry : partitionFiles.entrySet()) { + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + return savepointMetadata; + } + + public HoodieCommitMetadata doCompaction(String commitTime, List partitions) throws Exception { + return doCompaction(commitTime, partitions, false); + } + + public HoodieCommitMetadata doCompaction(String commitTime, List partitions, boolean inflight) throws Exception { + this.currentInstantTime = commitTime; + if (partitions.isEmpty()) { + partitions = Collections.singletonList(EMPTY_STRING); + } + HoodieTestTableState testTableState = getTestTableStateWithPartitionFileInfo(COMPACT, metaClient.getTableType(), commitTime, partitions, 1); + HoodieCommitMetadata commitMetadata = createCommitMetadata(COMPACT, commitTime, testTableState); + for (String partition : partitions) { + this.withBaseFilesInPartition(partition, testTableState.getPartitionToBaseFileInfoMap(commitTime).get(partition)); + } + if (inflight) { + this.addInflightCompaction(commitTime, commitMetadata); + } else { + this.addCompaction(commitTime, commitMetadata); + } + return commitMetadata; + } + + public HoodieCommitMetadata doWriteOperation(String 
commitTime, WriteOperationType operationType, + List partitions, int filesPerPartition) throws Exception { + return doWriteOperation(commitTime, operationType, Collections.emptyList(), partitions, filesPerPartition, false); + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, List partitions, + int filesPerPartition) throws Exception { + return doWriteOperation(commitTime, operationType, newPartitionsToAdd, partitions, filesPerPartition, false); + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, List partitions, + int filesPerPartition, boolean bootstrap) throws Exception { + return doWriteOperation(commitTime, operationType, newPartitionsToAdd, partitions, filesPerPartition, bootstrap, false); + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List partitions, int filesPerPartition, boolean bootstrap) throws Exception { + return doWriteOperation(commitTime, operationType, Collections.emptyList(), partitions, filesPerPartition, + bootstrap, false); + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, List partitions, + int filesPerPartition, boolean bootstrap, + boolean createInflightCommit) throws Exception { + if (partitions.isEmpty()) { + partitions = Collections.singletonList(EMPTY_STRING); + } + + Map>> partitionToFilesNameLengthMap = getPartitionFiles(partitions, + filesPerPartition); + return doWriteOperation(commitTime, operationType, newPartitionsToAdd, partitionToFilesNameLengthMap, bootstrap, + createInflightCommit); + } + + /** + * Add commits to the requested partitions. + * + * @param commitTime - Commit time for the operation + * @param operationType - Operation type + * @param newPartitionsToAdd - New partitions to add for the operation + * @param partitionToFilesNameLengthMap - Map of partition names to its list of files name and length pair + * @param bootstrap - Whether bootstrapping needed for the operation + * @param createInflightCommit - Whether in flight commit needed for the operation + * @return Commit metadata for the commit operation performed. 
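For orientation, a hedged usage sketch of this overload; the testTable handle, commit time, and partition name below are invented for illustration and are not part of this patch (assumes the usual test-utility imports such as Pair, WriteOperationType, UUID, Collections):

    // Describe one partition with a single base file of 120 bytes, then write it as an INSERT commit.
    Map<String, List<Pair<String, Integer>>> filesPerPartition = new HashMap<>();
    filesPerPartition.put("2016/03/15", Collections.singletonList(Pair.of(UUID.randomUUID().toString(), 120)));
    HoodieCommitMetadata meta = testTable.doWriteOperation("0000001", WriteOperationType.INSERT, Collections.emptyList(), filesPerPartition, false, false);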
+ * @throws Exception + */ + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, + Map>> partitionToFilesNameLengthMap, + boolean bootstrap, boolean createInflightCommit) throws Exception { + if (partitionToFilesNameLengthMap.isEmpty()) { + partitionToFilesNameLengthMap = Collections.singletonMap(EMPTY_STRING, Collections.EMPTY_LIST); + } + HoodieTestTableState testTableState = getTestTableStateWithPartitionFileInfo(operationType, + metaClient.getTableType(), commitTime, partitionToFilesNameLengthMap); + HoodieCommitMetadata commitMetadata = createCommitMetadata(operationType, commitTime, testTableState, bootstrap); + for (String str : newPartitionsToAdd) { + this.withPartitionMetaFiles(str); + } + if (createInflightCommit) { + if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { + this.addInflightCommit(commitTime); + } else { + this.addInflightDeltaCommit(commitTime); + } + } else { + if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { + this.addCommit(commitTime, Option.of(commitMetadata)); + } else { + this.addDeltaCommit(commitTime, commitMetadata); + } + } + for (Map.Entry>> entry : partitionToFilesNameLengthMap.entrySet()) { + String partition = entry.getKey(); + this.withBaseFilesInPartition(partition, testTableState.getPartitionToBaseFileInfoMap(commitTime).get(partition)); + if (MERGE_ON_READ.equals(metaClient.getTableType()) && UPSERT.equals(operationType)) { + this.withLogFilesInPartition(partition, testTableState.getPartitionToLogFileInfoMap(commitTime).get(partition)); + } + } + return commitMetadata; + } + + private Option getMetadataForInstant(String instantTime) { + metaClient = HoodieTableMetaClient.reload(metaClient); + Option hoodieInstant = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants().filter(i -> i.getTimestamp().equals(instantTime)).firstInstant(); + try { + if (hoodieInstant.isPresent()) { + return getCommitMeta(hoodieInstant.get()); + } else { + return Option.empty(); + } + } catch (IOException io) { + throw new HoodieIOException("Unable to read metadata for instant " + hoodieInstant.get(), io); + } + } + + private Option getCommitMeta(HoodieInstant hoodieInstant) throws IOException { + switch (hoodieInstant.getAction()) { + case HoodieTimeline.REPLACE_COMMIT_ACTION: + HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieReplaceCommitMetadata.class); + return Option.of(replaceCommitMetadata); + case HoodieTimeline.DELTA_COMMIT_ACTION: + case HoodieTimeline.COMMIT_ACTION: + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieCommitMetadata.class); + return Option.of(commitMetadata); + default: + throw new IllegalArgumentException("Unknown instant action" + hoodieInstant.getAction()); + } + } + + private static Map> getPartitionFiles(HoodieCommitMetadata commitMetadata) { + Map> partitionFilesToDelete = new HashMap<>(); + Map> partitionToWriteStats = commitMetadata.getPartitionToWriteStats(); + for (Map.Entry> entry : partitionToWriteStats.entrySet()) { + partitionFilesToDelete.put(entry.getKey(), new ArrayList<>()); + entry.getValue().forEach(writeStat -> partitionFilesToDelete.get(entry.getKey()).add(writeStat.getFileId())); + } + return partitionFilesToDelete; + } + + /** + * Generate partition files names and length 
details. + * + * @param partitions - List of partitions for which file details need to be generated + * @param filesPerPartition - File count per partition + * @return Map of partition to its collection of files name and length pair + */ + protected static Map>> getPartitionFiles(List partitions, + int filesPerPartition) { + Map>> partitionToFilesNameLengthMap = new HashMap<>(); + for (String partition : partitions) { + Stream fileLengths = IntStream.range(0, filesPerPartition).map(i -> 100 + RANDOM.nextInt(500)).boxed(); + List> fileNameAndLengthList = + fileLengths.map(len -> Pair.of(UUID.randomUUID().toString(), len)).collect(Collectors.toList()); + partitionToFilesNameLengthMap.put(partition, fileNameAndLengthList); + } + return partitionToFilesNameLengthMap; + } + + /** + * Get Test table state for the requested partitions and file count. + * + * @param operationType - Table write operation type + * @param tableType - Hudi table type + * @param commitTime - Write commit time + * @param partitions - List of partition names + * @param filesPerPartition - Total file count per partition + * @return Test table state for the requested partitions and file count + */ + private static HoodieTestTableState getTestTableStateWithPartitionFileInfo(WriteOperationType operationType, + HoodieTableType tableType, + String commitTime, + List partitions, + int filesPerPartition) { + Map>> partitionToFilesNameLengthMap = getPartitionFiles(partitions, + filesPerPartition); + return getTestTableStateWithPartitionFileInfo(operationType, tableType, commitTime, partitionToFilesNameLengthMap); + } + + /** + * Get Test table state for the requested partitions and files. + * + * @param operationType - Table write operation type + * @param tableType - Hudi table type + * @param commitTime - Write commit time + * @param partitionToFilesNameLengthMap - Map of partition names to its list of files and their lengths + * @return Test table state for the requested partitions and files + */ + private static HoodieTestTableState getTestTableStateWithPartitionFileInfo(WriteOperationType operationType, + HoodieTableType tableType, + String commitTime, + Map>> partitionToFilesNameLengthMap) { + HoodieTestTableState testTableState = new HoodieTestTableState(); + for (Map.Entry>> partitionEntry : partitionToFilesNameLengthMap.entrySet()) { + String partitionName = partitionEntry.getKey(); + List> fileNameAndLengthList = partitionEntry.getValue(); + if (MERGE_ON_READ.equals(tableType) && UPSERT.equals(operationType)) { + List> fileVersionAndLength = + fileNameAndLengthList.stream().map(nameLengthPair -> Pair.of(0, nameLengthPair.getRight())).collect(Collectors.toList()); + testTableState = testTableState.createTestTableStateForBaseAndLogFiles(commitTime, partitionName, + fileVersionAndLength); + } else { + testTableState = testTableState.createTestTableStateForBaseFilesOnly(commitTime, partitionName, + fileNameAndLengthList); + } + } + return testTableState; + } + + private static List generateHoodieWriteStatForPartition(Map>> partitionToFileIdMap, + String commitTime, boolean bootstrap) { + List writeStats = new ArrayList<>(); + for (Map.Entry>> entry : partitionToFileIdMap.entrySet()) { + String partition = entry.getKey(); + for (Pair fileIdInfo : entry.getValue()) { + HoodieWriteStat writeStat = new HoodieWriteStat(); + String fileName = bootstrap ? 
fileIdInfo.getKey() : + FileCreateUtils.baseFileName(commitTime, fileIdInfo.getKey()); + writeStat.setFileId(fileName); + writeStat.setPartitionPath(partition); + writeStat.setPath(partition + "/" + fileName); + writeStat.setTotalWriteBytes(fileIdInfo.getValue()); + writeStats.add(writeStat); + } + } + return writeStats; + } + + /** + * Returns the write stats for log files in the partition. Since log file has version associated with it, the {@param partitionToFileIdMap} + * contains list of Pair where the Integer[] array has both file version and file size. + */ + private static List generateHoodieWriteStatForPartitionLogFiles(Map>> partitionToFileIdMap, String commitTime, boolean bootstrap) { + List writeStats = new ArrayList<>(); + if (partitionToFileIdMap == null) { + return writeStats; + } + for (Map.Entry>> entry : partitionToFileIdMap.entrySet()) { + String partition = entry.getKey(); + for (Pair fileIdInfo : entry.getValue()) { + HoodieWriteStat writeStat = new HoodieWriteStat(); + String fileName = bootstrap ? fileIdInfo.getKey() : + FileCreateUtils.logFileName(commitTime, fileIdInfo.getKey(), fileIdInfo.getValue()[0]); + writeStat.setFileId(fileName); + writeStat.setPartitionPath(partition); + writeStat.setPath(partition + "/" + fileName); + writeStat.setTotalWriteBytes(fileIdInfo.getValue()[1]); + writeStats.add(writeStat); + } + } + return writeStats; + } + public static class HoodieTestTableException extends RuntimeException { public HoodieTestTableException(Throwable t) { super(t); } } + + static class HoodieTestTableState { + /** + * Map>> + * Used in building CLEAN metadata. + */ + Map>> commitsToPartitionToFileIdForCleaner = new HashMap<>(); + /** + * Map>>> + * Used to build commit metadata for base files for several write operations. + */ + Map>>> commitsToPartitionToBaseFileInfoStats = new HashMap<>(); + /** + * Map>>> + * Used to build commit metadata for log files for several write operations. 
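Because the generic parameters were stripped in the rendering above, here is a hedged sketch of the shape these commit-keyed maps hold; the commit time, partition, and sizes are invented for illustration:

    // commitTime -> partitionPath -> list of (baseFileName, lengthInBytes)
    Map<String, Map<String, List<Pair<String, Integer>>>> baseFileStats = new HashMap<>();
    baseFileStats.computeIfAbsent("0000001", k -> new HashMap<>())
        .computeIfAbsent("2016/03/15", k -> new ArrayList<>())
        .add(Pair.of(UUID.randomUUID().toString(), 120));
    // commitTime -> partitionPath -> list of (fileId, [logFileVersion, lengthInBytes])
    Map<String, Map<String, List<Pair<String, Integer[]>>>> logFileStats = new HashMap<>();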
+ */ + Map>>> commitsToPartitionToLogFileInfoStats = new HashMap<>(); + + HoodieTestTableState() { + } + + static HoodieTestTableState of() { + return new HoodieTestTableState(); + } + + HoodieTestTableState createTestTableStateForCleaner(String commitTime, String partitionPath, List filesToClean) { + if (!commitsToPartitionToFileIdForCleaner.containsKey(commitTime)) { + commitsToPartitionToFileIdForCleaner.put(commitTime, new HashMap<>()); + } + if (!this.commitsToPartitionToFileIdForCleaner.get(commitTime).containsKey(partitionPath)) { + this.commitsToPartitionToFileIdForCleaner.get(commitTime).put(partitionPath, new ArrayList<>()); + } + + this.commitsToPartitionToFileIdForCleaner.get(commitTime).get(partitionPath).addAll(filesToClean); + return this; + } + + Map> getPartitionToFileIdMapForCleaner(String commitTime) { + return this.commitsToPartitionToFileIdForCleaner.get(commitTime); + } + + HoodieTestTableState createTestTableStateForBaseFileLengthsOnly(String commitTime, String partitionPath, + List lengths) { + List> fileNameLengthList = new ArrayList<>(); + for (int length : lengths) { + fileNameLengthList.add(Pair.of(UUID.randomUUID().toString(), length)); + } + return createTestTableStateForBaseFilesOnly(commitTime, partitionPath, fileNameLengthList); + } + + HoodieTestTableState createTestTableStateForBaseFilesOnly(String commitTime, String partitionPath, + List> fileNameAndLengthList) { + if (!commitsToPartitionToBaseFileInfoStats.containsKey(commitTime)) { + commitsToPartitionToBaseFileInfoStats.put(commitTime, new HashMap<>()); + } + if (!this.commitsToPartitionToBaseFileInfoStats.get(commitTime).containsKey(partitionPath)) { + this.commitsToPartitionToBaseFileInfoStats.get(commitTime).put(partitionPath, new ArrayList<>()); + } + + this.commitsToPartitionToBaseFileInfoStats.get(commitTime).get(partitionPath).addAll(fileNameAndLengthList); + return this; + } + + HoodieTestTableState createTestTableStateForBaseAndLogFiles(String commitTime, String partitionPath, + List> versionsAndLengths) { + if (!commitsToPartitionToBaseFileInfoStats.containsKey(commitTime)) { + createTestTableStateForBaseFileLengthsOnly(commitTime, partitionPath, + versionsAndLengths.stream().map(Pair::getRight).collect(Collectors.toList())); + } + if (!this.commitsToPartitionToBaseFileInfoStats.get(commitTime).containsKey(partitionPath)) { + createTestTableStateForBaseFileLengthsOnly(commitTime, partitionPath, + versionsAndLengths.stream().map(Pair::getRight).collect(Collectors.toList())); + } + if (!commitsToPartitionToLogFileInfoStats.containsKey(commitTime)) { + commitsToPartitionToLogFileInfoStats.put(commitTime, new HashMap<>()); + } + if (!this.commitsToPartitionToLogFileInfoStats.get(commitTime).containsKey(partitionPath)) { + this.commitsToPartitionToLogFileInfoStats.get(commitTime).put(partitionPath, new ArrayList<>()); + } + + List> fileInfos = new ArrayList<>(); + for (int i = 0; i < versionsAndLengths.size(); i++) { + Pair versionAndLength = versionsAndLengths.get(i); + String fileId = FSUtils.getFileId(commitsToPartitionToBaseFileInfoStats.get(commitTime).get(partitionPath).get(i).getLeft()); + fileInfos.add(Pair.of(fileId, new Integer[] {versionAndLength.getLeft(), versionAndLength.getRight()})); + } + this.commitsToPartitionToLogFileInfoStats.get(commitTime).get(partitionPath).addAll(fileInfos); + return this; + } + + Map>> getPartitionToBaseFileInfoMap(String commitTime) { + return this.commitsToPartitionToBaseFileInfoStats.get(commitTime); + } + + Map>> getPartitionToLogFileInfoMap(String 
commitTime) { + return this.commitsToPartitionToLogFileInfoStats.get(commitTime); + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java index bc1c18a7913be..d03dca0c81887 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java @@ -63,9 +63,10 @@ public static HoodieTableMetaClient init(String basePath, HoodieTableType tableT return init(getDefaultHadoopConf(), basePath, tableType); } - public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath) throws IOException { + public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath, boolean bootstrapIndexEnable) throws IOException { Properties props = new Properties(); props.setProperty(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key(), bootstrapBasePath); + props.put(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE.key(), bootstrapIndexEnable); return init(getDefaultHadoopConf(), basePath, tableType, props); } @@ -110,8 +111,15 @@ public static HoodieTableMetaClient init(Configuration hadoopConf, String basePa return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties); } + public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath, HoodieFileFormat baseFileFormat) throws IOException { + Properties props = new Properties(); + props.setProperty(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key(), bootstrapBasePath); + props.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), baseFileFormat.name()); + return init(getDefaultHadoopConf(), basePath, tableType, props); + } + public static T serializeDeserialize(T object, Class clazz) { - // Using Kyro as the default serializer in Spark Jobs + // Using Kryo as the default serializer in Spark Jobs Kryo kryo = new Kryo(); kryo.register(HoodieTableMetaClient.class, new JavaSerializer()); @@ -121,9 +129,9 @@ public static T serializeDeserialize(T object, Class output.close(); Input input = new Input(new ByteArrayInputStream(baos.toByteArray())); - T deseralizedObject = kryo.readObject(input, clazz); + T deserializedObject = kryo.readObject(input, clazz); input.close(); - return deseralizedObject; + return deserializedObject; } public static List generateFakeHoodieWriteStat(int limit) { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetReaderIterator.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetReaderIterator.java index 799ed248b1d8a..37fead4928b43 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetReaderIterator.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetReaderIterator.java @@ -18,7 +18,7 @@ package org.apache.hudi.common.util; -import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieException; import org.apache.parquet.hadoop.ParquetReader; import org.junit.jupiter.api.Test; @@ -30,6 +30,8 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; public class TestParquetReaderIterator { @@ -59,6 +61,7 @@ public void 
testParquetIterator() throws IOException { assertEquals(1, iterator.next()); // no more entries to iterate on assertFalse(iterator.hasNext()); - assertThrows(HoodieIOException.class, iterator::next, "should throw an exception since there is only 1 record"); + assertThrows(HoodieException.class, iterator::next, "should throw an exception since there is only 1 record"); + verify(reader, times(1)).close(); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java new file mode 100644 index 0000000000000..de0424f42580a --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import javax.xml.bind.DatatypeConverter; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHashID { + + /** + * Test HashID of all sizes for ByteArray type input message. + */ + @ParameterizedTest + @EnumSource(HashID.Size.class) + public void testHashForByteInput(HashID.Size size) { + final int count = 8; + Random random = new Random(); + for (int i = 0; i < count; i++) { + final String message = random.ints(50, 120) + .filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97)) + .limit((32 + (i * 4))) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString(); + final byte[] originalData = message.getBytes(StandardCharsets.UTF_8); + final byte[] hashBytes = HashID.hash(originalData, size); + assertEquals(hashBytes.length, size.byteSize()); + } + } + + /** + * Test HashID of all sizes for String type input message. + */ + @ParameterizedTest + @EnumSource(HashID.Size.class) + public void testHashForStringInput(HashID.Size size) { + final int count = 8; + Random random = new Random(); + for (int i = 0; i < count; i++) { + final String message = random.ints(50, 120) + .filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97)) + .limit((32 + (i * 4))) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString(); + final byte[] hashBytes = HashID.hash(message, size); + assertEquals(hashBytes.length, size.byteSize()); + } + } + + /** + * Test expected hash values for all bit sizes. 
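As a hedged illustration of what the expected-value table in this test asserts (HashID.hash and HashID.Size come from the class under test; printHexBinary is the javax.xml.bind helper already imported above):

    // Hex-encode the 32-bit hash of "Hudi"; per the table below this is expected to be "FB6A3F92".
    byte[] hashBytes = HashID.hash("Hudi", HashID.Size.BITS_32);
    String hex = DatatypeConverter.printHexBinary(hashBytes);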
+ */ + @Test + public void testHashValues() { + Map> expectedValuesMap = new HashMap>(); + Map hash32ExpectedValues = new HashMap() { + { + put("Hudi", "FB6A3F92"); + put("Data lake", "99913A4D"); + put("Data Lake", "6F7DAD6A"); + put("Col1", "B4393B9A"); + put("A", "CDD946CE"); + put("2021/10/28/", "BBD4FDB2"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_32, hash32ExpectedValues); + + Map hash64ExpectedValues = new HashMap() { + { + put("Hudi", "F7727B9A28379071"); + put("Data lake", "52BC72D592EBCAE5"); + put("Data Lake", "5ED19AF9FD746E3E"); + put("Col1", "22FB1DD2F4784D31"); + put("A", "EBF88350484B5AA7"); + put("2021/10/28/", "2A9399AF6E7C8B12"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_64, hash64ExpectedValues); + + Map hash128ExpectedValues = new HashMap() { + { + put("Hudi", "09DAB749F255311C1C9EF6DD7B790170"); + put("Data lake", "7F2FC1EA445FC81F67CAA25EC9089C08"); + put("Data Lake", "9D2CEF0D61B02848C528A070ED75C570"); + put("Col1", "EC0FFE21E704DE2A580661C59A81D453"); + put("A", "7FC56270E7A70FA81A5935B72EACBE29"); + put("2021/10/28/", "1BAE8F04F44CB7ACF2458EF5219742DC"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_128, hash128ExpectedValues); + + for (Map.Entry> allSizeEntries : expectedValuesMap.entrySet()) { + for (Map.Entry sizeEntry : allSizeEntries.getValue().entrySet()) { + final byte[] actualHashBytes = HashID.hash(sizeEntry.getKey(), allSizeEntries.getKey()); + final byte[] expectedHashBytes = DatatypeConverter.parseHexBinary(sizeEntry.getValue()); + assertTrue(Arrays.equals(expectedHashBytes, actualHashBytes)); + } + } + } +} diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java b/hudi-examples/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java index 4d06e4d15fa09..587f73b0f7fd4 100644 --- a/hudi-examples/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java +++ b/hudi-examples/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java @@ -96,7 +96,7 @@ public static void main(String[] args) throws Exception { List> recordsSoFar = new ArrayList<>(records); List> writeRecords = recordsSoFar.stream().map(r -> new HoodieRecord(r)).collect(Collectors.toList()); - client.upsert(writeRecords, newCommitTime); + client.insert(writeRecords, newCommitTime); // updates newCommitTime = client.startCommit(); diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieSparkBootstrapExample.java b/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieSparkBootstrapExample.java new file mode 100644 index 0000000000000..e385e476dd26e --- /dev/null +++ b/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieSparkBootstrapExample.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.examples.spark; + +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.config.HoodieBootstrapConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.examples.common.HoodieExampleSparkUtils; +import org.apache.hudi.keygen.NonpartitionedKeyGenerator; +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.Dataset; + + + +public class HoodieSparkBootstrapExample { + + private static String tableType = HoodieTableType.MERGE_ON_READ.name(); + + + public static void main(String[] args) throws Exception { + if (args.length < 5) { + System.err.println("Usage: HoodieWriteClientExample "); + System.exit(1); + } + String recordKey = args[0]; + String tableName = args[1]; + String partitionPath = args[2]; + String preCombineField = args[3]; + String basePath = args[4]; + + SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example"); + + SparkSession spark = SparkSession + .builder() + .appName("Java Spark SQL basic example") + .config("spark.some.config.option", "some-value") + .enableHiveSupport() + .getOrCreate(); + + Dataset df = spark.emptyDataFrame(); + + df.write().format("hudi").option(HoodieWriteConfig.TBL_NAME.key(), tableName) + .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL()) + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey) + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPath) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineField) + .option(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieFileFormat.ORC.name()) + .option(HoodieBootstrapConfig.BASE_PATH.key(), basePath) + .option(HoodieBootstrapConfig.KEYGEN_CLASS_NAME.key(), NonpartitionedKeyGenerator.class.getCanonicalName()) + .mode(SaveMode.Overwrite).save("/hudi/"+tableName); + + df.count(); + } +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java b/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java index bdb58e424a67d..65a95ed7c2b01 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java +++ b/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java @@ -100,7 +100,7 @@ private FlinkOptions() { public static final ConfigOption METADATA_COMPACTION_DELTA_COMMITS = ConfigOptions .key("metadata.compaction.delta_commits") .intType() - .defaultValue(24) + .defaultValue(10) .withDescription("Max delta commits for metadata table to trigger compaction, default 24"); // ------------------------------------------------------------------------ @@ -115,15 +115,15 @@ private FlinkOptions() { public static final ConfigOption INDEX_STATE_TTL = ConfigOptions .key("index.state.ttl") .doubleType() - .defaultValue(1.5D) - .withDescription("Index state ttl in days, default 1.5 day"); + .defaultValue(0D) + .withDescription("Index state ttl in days, default stores the index permanently"); public static final ConfigOption INDEX_GLOBAL_ENABLED = ConfigOptions .key("index.global.enabled") .booleanType() - .defaultValue(false) + .defaultValue(true) .withDescription("Whether to update 
index for the old partition path\n" - + "if same key record with different partition path came in, default false"); + + "if same key record with different partition path came in, default true"); public static final ConfigOption INDEX_PARTITION_REGEX = ConfigOptions .key("index.partition.regex") @@ -197,12 +197,18 @@ private FlinkOptions() { .withDescription("Check interval for streaming read of SECOND, default 1 minute"); public static final String START_COMMIT_EARLIEST = "earliest"; - public static final ConfigOption READ_STREAMING_START_COMMIT = ConfigOptions - .key("read.streaming.start-commit") + public static final ConfigOption READ_START_COMMIT = ConfigOptions + .key("read.start-commit") + .stringType() + .noDefaultValue() + .withDescription("Start commit instant for reading, the commit time format should be 'yyyyMMddHHmmss', " + + "by default reading from the latest instant for streaming read"); + + public static final ConfigOption READ_END_COMMIT = ConfigOptions + .key("read.end-commit") .stringType() .noDefaultValue() - .withDescription("Start commit instant for streaming read, the commit time format should be 'yyyyMMddHHmmss', " - + "by default reading from the latest instant"); + .withDescription("End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss'"); // ------------------------------------------------------------------------ // Write Options @@ -221,11 +227,13 @@ private FlinkOptions() { .defaultValue(TABLE_TYPE_COPY_ON_WRITE) .withDescription("Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ"); - public static final ConfigOption INSERT_DEDUP = ConfigOptions - .key("write.insert.deduplicate") + public static final ConfigOption INSERT_CLUSTER = ConfigOptions + .key("write.insert.cluster") .booleanType() - .defaultValue(true) - .withDescription("Whether to deduplicate for INSERT operation, if disabled, writes the base files directly, default true"); + .defaultValue(false) + .withDescription("Whether to merge small files for insert mode, " + + "if true, the write throughput will decrease because the read/write of existing small file, " + + "only valid for COW table, default false"); public static final ConfigOption OPERATION = ConfigOptions .key("write.operation") @@ -233,6 +241,7 @@ private FlinkOptions() { .defaultValue("upsert") .withDescription("The write operation, that this write should do"); + public static final String NO_PRE_COMBINE = "no_precombine"; public static final ConfigOption PRECOMBINE_FIELD = ConfigOptions .key("write.precombine.field") .stringType() @@ -249,15 +258,17 @@ private FlinkOptions() { + "This will render any value set for the option in-effective"); /** - * Flag to indicate whether to drop duplicates upon insert. - * By default insert will accept duplicates, to gain extra performance. + * Flag to indicate whether to drop duplicates before insert/upsert. + * By default false to gain extra performance. 
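A hedged sketch of setting the options above on a Flink job Configuration; the ordering field "ts" and the instant times are invented for illustration:

    // Drop duplicates before insert/upsert and point at the ordering field.
    Configuration conf = new Configuration();
    conf.setBoolean(FlinkOptions.PRE_COMBINE, true);
    conf.setString(FlinkOptions.PRECOMBINE_FIELD, "ts");
    // Bounded read between two instants, using the keys introduced in this patch (format yyyyMMddHHmmss).
    conf.setString(FlinkOptions.READ_START_COMMIT, "20211001000000");
    conf.setString(FlinkOptions.READ_END_COMMIT, "20211002000000");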
*/ - public static final ConfigOption INSERT_DROP_DUPS = ConfigOptions - .key("write.insert.drop.duplicates") + public static final ConfigOption PRE_COMBINE = ConfigOptions + .key("write.precombine") .booleanType() .defaultValue(false) - .withDescription("Flag to indicate whether to drop duplicates upon insert.\n" - + "By default insert will accept duplicates, to gain extra performance"); + .withDescription("Flag to indicate whether to drop duplicates before insert/upsert.\n" + + "By default these cases will accept duplicates, to gain extra performance:\n" + + "1) insert operation;\n" + + "2) upsert for MOR table, the MOR table deduplicate on reading"); public static final ConfigOption RETRY_TIMES = ConfigOptions .key("write.retry.times") @@ -335,19 +346,19 @@ private FlinkOptions() { .key("write.index_bootstrap.tasks") .intType() .noDefaultValue() - .withDescription("Parallelism of tasks that do index bootstrap, default is the parallelism of the environment"); + .withDescription("Parallelism of tasks that do index bootstrap, default is the parallelism of the execution environment"); public static final ConfigOption BUCKET_ASSIGN_TASKS = ConfigOptions .key("write.bucket_assign.tasks") .intType() .noDefaultValue() - .withDescription("Parallelism of tasks that do bucket assign, default is the parallelism of the environment"); + .withDescription("Parallelism of tasks that do bucket assign, default is the parallelism of the execution environment"); public static final ConfigOption WRITE_TASKS = ConfigOptions .key("write.tasks") .intType() - .noDefaultValue() - .withDescription("Parallelism of tasks that do actual write, default is the parallelism of the environment"); + .defaultValue(4) + .withDescription("Parallelism of tasks that do actual write, default is 4"); public static final ConfigOption WRITE_TASK_MAX_SIZE = ConfigOptions .key("write.task.max.size") @@ -365,8 +376,8 @@ private FlinkOptions() { public static final ConfigOption WRITE_BATCH_SIZE = ConfigOptions .key("write.batch.size") .doubleType() - .defaultValue(64D) // 64MB - .withDescription("Batch buffer size in MB to flush data into the underneath filesystem, default 64MB"); + .defaultValue(256D) // 256MB + .withDescription("Batch buffer size in MB to flush data into the underneath filesystem, default 256MB"); public static final ConfigOption WRITE_LOG_BLOCK_SIZE = ConfigOptions .key("write.log_block.size") @@ -452,8 +463,8 @@ private FlinkOptions() { public static final ConfigOption COMPACTION_TASKS = ConfigOptions .key("compaction.tasks") .intType() - .defaultValue(10) // default WRITE_TASKS * COMPACTION_DELTA_COMMITS * 0.5 (assumes two commits generate one bucket) - .withDescription("Parallelism of tasks that do actual compaction, default is 10"); + .defaultValue(4) // default WRITE_TASKS * COMPACTION_DELTA_COMMITS * 0.2 (assumes 5 commits generate one bucket) + .withDescription("Parallelism of tasks that do actual compaction, default is 4"); public static final String NUM_COMMITS = "num_commits"; public static final String TIME_ELAPSED = "time_elapsed"; @@ -481,6 +492,12 @@ private FlinkOptions() { .defaultValue(3600) // default 1 hour .withDescription("Max delta seconds time needed to trigger compaction, default 1 hour"); + public static final ConfigOption COMPACTION_TIMEOUT_SECONDS = ConfigOptions + .key("compaction.timeout.seconds") + .intType() + .defaultValue(1200) // default 20 minutes + .withDescription("Max timeout time in seconds for online compaction to rollback, default 20 minutes"); + public static final 
ConfigOption COMPACTION_MAX_MEMORY = ConfigOptions .key("compaction.max_memory") .intType() @@ -490,8 +507,8 @@ private FlinkOptions() { public static final ConfigOption COMPACTION_TARGET_IO = ConfigOptions .key("compaction.target_io") .longType() - .defaultValue(5120L) // default 5 GB - .withDescription("Target IO per compaction (both read and write), default 5 GB"); + .defaultValue(500 * 1024L) // default 500 GB + .withDescription("Target IO per compaction (both read and write), default 500 GB"); public static final ConfigOption CLEAN_ASYNC_ENABLED = ConfigOptions .key("clean.async.enabled") diff --git a/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java b/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java new file mode 100644 index 0000000000000..075736fe38a17 --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.configuration; + +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; + +import org.apache.flink.configuration.Configuration; + +import java.util.Locale; + +/** + * Tool helping to resolve the flink options {@link FlinkOptions}. + */ +public class OptionsResolver { + /** + * Returns whether insert clustering is allowed with given configuration {@code conf}. + */ + public static boolean insertClustering(Configuration conf) { + return isCowTable(conf) && isInsertOperation(conf) && conf.getBoolean(FlinkOptions.INSERT_CLUSTER); + } + + /** + * Returns whether the insert is clustering disabled with given configuration {@code conf}. + */ + public static boolean isAppendMode(Configuration conf) { + return isCowTable(conf) && isInsertOperation(conf) && !conf.getBoolean(FlinkOptions.INSERT_CLUSTER); + } + + /** + * Returns whether the table operation is 'insert'. + */ + public static boolean isInsertOperation(Configuration conf) { + WriteOperationType operationType = WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)); + return operationType == WriteOperationType.INSERT; + } + + /** + * Returns whether it is a MERGE_ON_READ table. + */ + public static boolean isMorTable(Configuration conf) { + return conf.getString(FlinkOptions.TABLE_TYPE) + .toUpperCase(Locale.ROOT) + .equals(FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + } + + /** + * Returns whether it is a COPY_ON_WRITE table. + */ + public static boolean isCowTable(Configuration conf) { + return conf.getString(FlinkOptions.TABLE_TYPE) + .toUpperCase(Locale.ROOT) + .equals(FlinkOptions.TABLE_TYPE_COPY_ON_WRITE); + } + + /** + * Returns whether the payload clazz is {@link DefaultHoodieRecordPayload}. 
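A hedged sketch of combining these resolver helpers; the configuration values are invented for illustration:

    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
    conf.setString(FlinkOptions.OPERATION, "insert");
    boolean mor = OptionsResolver.isMorTable(conf);       // true for this configuration
    boolean append = OptionsResolver.isAppendMode(conf);  // false: append mode also requires a COW table
    String orderingField = OptionsResolver.getPreCombineField(conf); // null only when set to FlinkOptions.NO_PRE_COMBINE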
+ */ + public static boolean isDefaultHoodieRecordPayloadClazz(Configuration conf) { + return conf.getString(FlinkOptions.PAYLOAD_CLASS_NAME).contains(DefaultHoodieRecordPayload.class.getSimpleName()); + } + + /** + * Returns the preCombine field + * or null if the value is set as {@link FlinkOptions#NO_PRE_COMBINE}. + */ + public static String getPreCombineField(Configuration conf) { + final String preCombineField = conf.getString(FlinkOptions.PRECOMBINE_FIELD); + return preCombineField.equals(FlinkOptions.NO_PRE_COMBINE) ? null : preCombineField; + } +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaRegistryProvider.java b/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaRegistryProvider.java new file mode 100644 index 0000000000000..c302c1db0d133 --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaRegistryProvider.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.schema; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.util.StreamerUtil; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.avro.Schema; + +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Base64; +import java.util.Collections; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Obtains latest schema from the Confluent/Kafka schema-registry. + *

+ * https://github.com/confluentinc/schema-registry + */ +public class SchemaRegistryProvider extends SchemaProvider { + + private final TypedProperties config; + + + /** + * Configs supported. + */ + public static class Config { + + private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; + private static final String TARGET_SCHEMA_REGISTRY_URL_PROP = + "hoodie.deltastreamer.schemaprovider.registry.targetUrl"; + } + + /** + * The method takes the provided url {@code registryUrl} and gets the schema from the schema registry using that url. + * If the caller provides userInfo credentials in the url (e.g "https://foo:bar@schemaregistry.org") then the credentials + * are extracted the url using the Matcher and the extracted credentials are set on the request as an Authorization + * header. + * + * @param registryUrl + * @return the Schema in String form. + * @throws IOException + */ + public String fetchSchemaFromRegistry(String registryUrl) throws IOException { + URL registry; + HttpURLConnection connection; + Matcher matcher = Pattern.compile("://(.*?)@").matcher(registryUrl); + if (matcher.find()) { + String creds = matcher.group(1); + String urlWithoutCreds = registryUrl.replace(creds + "@", ""); + registry = new URL(urlWithoutCreds); + connection = (HttpURLConnection) registry.openConnection(); + setAuthorizationHeader(matcher.group(1), connection); + } else { + registry = new URL(registryUrl); + connection = (HttpURLConnection) registry.openConnection(); + } + ObjectMapper mapper = new ObjectMapper(); + JsonNode node = mapper.readTree(getStream(connection)); + return node.get("schema").asText(); + } + + protected void setAuthorizationHeader(String creds, HttpURLConnection connection) { + String encodedAuth = Base64.getEncoder().encodeToString(creds.getBytes(StandardCharsets.UTF_8)); + connection.setRequestProperty("Authorization", "Basic " + encodedAuth); + } + + protected InputStream getStream(HttpURLConnection connection) throws IOException { + return connection.getInputStream(); + } + + public SchemaRegistryProvider(TypedProperties props) { + this.config = props; + StreamerUtil.checkRequiredProperties(props, Collections.singletonList(Config.SRC_SCHEMA_REGISTRY_URL_PROP)); + } + + private Schema getSchema(String registryUrl) throws IOException { + return new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl)); + } + + @Override + public Schema getSourceSchema() { + String registryUrl = config.getString(Config.SRC_SCHEMA_REGISTRY_URL_PROP); + try { + return getSchema(registryUrl); + } catch (IOException ioe) { + throw new HoodieIOException("Error reading source schema from registry :" + registryUrl, ioe); + } + } + + @Override + public Schema getTargetSchema() { + String registryUrl = config.getString(Config.SRC_SCHEMA_REGISTRY_URL_PROP); + String targetRegistryUrl = config.getString(Config.TARGET_SCHEMA_REGISTRY_URL_PROP, registryUrl); + try { + return getSchema(targetRegistryUrl); + } catch (IOException ioe) { + throw new HoodieIOException("Error reading target schema from registry :" + registryUrl, ioe); + } + } +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/CleanFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/CleanFunction.java index 1ca593ff53501..26ac9f3adf6f3 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/CleanFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/CleanFunction.java @@ -47,6 +47,7 @@ public class CleanFunction extends AbstractRichFunction private 
final Configuration conf; protected HoodieFlinkWriteClient writeClient; + private NonThrownExecutor executor; private volatile boolean isCleaning; @@ -60,7 +61,7 @@ public void open(Configuration parameters) throws Exception { super.open(parameters); if (conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED)) { this.writeClient = StreamerUtil.createWriteClient(conf, getRuntimeContext()); - this.executor = new NonThrownExecutor(LOG); + this.executor = NonThrownExecutor.builder(LOG).build(); } } @@ -81,8 +82,13 @@ public void notifyCheckpointComplete(long l) throws Exception { @Override public void snapshotState(FunctionSnapshotContext context) throws Exception { if (conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED) && !isCleaning) { - this.writeClient.startAsyncCleaning(); - this.isCleaning = true; + try { + this.writeClient.startAsyncCleaning(); + this.isCleaning = true; + } catch (Throwable throwable) { + // catch the exception to not affect the normal checkpointing + LOG.warn("Error while start async cleaning", throwable); + } } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java index a155fb52d8852..0e7e35e7ea328 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java @@ -67,7 +67,7 @@ *

<p><h2>The Semantics</h2>
 *
 * <p>

The task implements exactly-once semantics by buffering the data between checkpoints. The operator coordinator - * starts a new instant on the time line when a checkpoint triggers, the coordinator checkpoints always + * starts a new instant on the timeline when a checkpoint triggers, the coordinator checkpoints always * start before its operator, so when this function starts a checkpoint, a REQUESTED instant already exists. * *
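A hedged restatement of the ordering described above, as comments only (not part of the patch):

    // 1. Coordinator checkpoint: a new instant is put into REQUESTED state on the timeline (startInstant()).
    // 2. Operator checkpoint: this function flushes its buffered records against that REQUESTED instant.
    // 3. notifyCheckpointComplete: the coordinator commits the instant and then starts the next one.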

The function process thread blocks data buffering after the checkpoint thread finishes flushing the existing data buffer until @@ -155,7 +155,6 @@ public void endInput() { // ------------------------------------------------------------------------- // Getter/Setter // ------------------------------------------------------------------------- - @VisibleForTesting @SuppressWarnings("rawtypes") public Map> getDataBuffer() { @@ -378,6 +377,8 @@ private void bufferRecord(HoodieRecord value) { k -> new DataBucket(this.config.getDouble(FlinkOptions.WRITE_BATCH_SIZE), value)); final DataItem item = DataItem.fromHoodieRecord(value); + bucket.records.add(item); + boolean flushBucket = bucket.detector.detect(item); boolean flushBuffer = this.tracer.trace(bucket.detector.lastRecordSize); if (flushBucket) { @@ -398,7 +399,6 @@ private void bufferRecord(HoodieRecord value) { LOG.warn("The buffer size hits the threshold {}, but still flush the max size data bucket failed!", this.tracer.maxBufferSize); } } - bucket.records.add(item); } private boolean hasData() { @@ -418,7 +418,7 @@ private boolean flushBucket(DataBucket bucket) { List records = bucket.writeBuffer(); ValidationUtils.checkState(records.size() > 0, "Data bucket to flush has no buffering records"); - if (config.getBoolean(FlinkOptions.INSERT_DROP_DUPS)) { + if (config.getBoolean(FlinkOptions.PRE_COMBINE)) { records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1); } bucket.preWrite(records); @@ -453,7 +453,7 @@ private void flushRemaining(boolean endInput) { .forEach(bucket -> { List records = bucket.writeBuffer(); if (records.size() > 0) { - if (config.getBoolean(FlinkOptions.INSERT_DROP_DUPS)) { + if (config.getBoolean(FlinkOptions.PRE_COMBINE)) { records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1); } bucket.preWrite(records); diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java index 3049947f8a244..0af38c41fbc5d 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java @@ -22,6 +22,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.CommitUtils; @@ -31,7 +32,6 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.sink.event.CommitAckEvent; import org.apache.hudi.sink.event.WriteMetadataEvent; -import org.apache.hudi.sink.utils.CoordinatorExecutor; import org.apache.hudi.sink.utils.HiveSyncContext; import org.apache.hudi.sink.utils.NonThrownExecutor; import org.apache.hudi.util.StreamerUtil; @@ -41,6 +41,7 @@ import org.apache.flink.runtime.jobgraph.OperatorID; import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.TaskNotRunningException; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -57,7 +58,6 @@ import java.util.concurrent.CompletableFuture; import 
java.util.concurrent.CompletionException; import java.util.stream.Collectors; -import java.util.stream.IntStream; import static org.apache.hudi.util.StreamerUtil.initTableIfNotExists; @@ -95,6 +95,11 @@ public class StreamWriteOperatorCoordinator */ private transient HoodieFlinkWriteClient writeClient; + /** + * Meta client. + */ + private transient HoodieTableMetaClient metaClient; + /** * Current REQUESTED instant, for validation. */ @@ -114,7 +119,7 @@ public class StreamWriteOperatorCoordinator /** * A single-thread executor to handle all the asynchronous jobs of the coordinator. */ - private CoordinatorExecutor executor; + private NonThrownExecutor executor; /** * A single-thread executor to handle asynchronous hive sync. @@ -126,11 +131,6 @@ public class StreamWriteOperatorCoordinator */ private HiveSyncContext hiveSyncContext; - /** - * A single-thread executor to handle metadata table sync. - */ - private NonThrownExecutor metadataSyncExecutor; - /** * The table state. */ @@ -158,12 +158,15 @@ public void start() throws Exception { // initialize event buffer reset(); this.gateways = new SubtaskGateway[this.parallelism]; + // init table, create if not exists. + this.metaClient = initTableIfNotExists(this.conf); + // the write client must create after the table creation this.writeClient = StreamerUtil.createWriteClient(conf); this.tableState = TableState.create(conf); - // init table, create it if not exists. - initTableIfNotExists(this.conf); // start the executor - this.executor = new CoordinatorExecutor(this.context, LOG); + this.executor = NonThrownExecutor.builder(LOG) + .exceptionHook((errMsg, t) -> this.context.failJob(new HoodieException(errMsg, t))) + .waitForTasksFinish(true).build(); // start the executor if required if (tableState.syncHive) { initHiveSync(); @@ -176,15 +179,17 @@ public void start() throws Exception { @Override public void close() throws Exception { // teardown the resource - if (writeClient != null) { - writeClient.close(); - } if (executor != null) { executor.close(); } if (hiveSyncExecutor != null) { hiveSyncExecutor.close(); } + // the write client must close after the executor service + // because the task in the service may send requests to the embedded timeline service. + if (writeClient != null) { + writeClient.close(); + } this.eventBuffer = null; } @@ -215,18 +220,16 @@ public void notifyCheckpointComplete(long checkpointId) { // for streaming mode, commits the ever received events anyway, // the stream write task snapshot and flush the data buffer synchronously in sequence, // so a successful checkpoint subsumes the old one(follows the checkpoint subsuming contract) - final boolean committed = commitInstant(this.instant); + final boolean committed = commitInstant(this.instant, checkpointId); if (committed) { - // if async compaction is on, schedule the compaction if (tableState.scheduleCompaction) { + // if async compaction is on, schedule the compaction writeClient.scheduleCompaction(Option.empty()); } // start new instant. 
startInstant(); // sync Hive if is enabled syncHiveIfEnabled(); - // sync metadata if is enabled - syncMetadataIfEnabled(); } }, "commits the instant %s", this.instant ); @@ -278,7 +281,7 @@ public void subtaskReady(int i, SubtaskGateway subtaskGateway) { // ------------------------------------------------------------------------- private void initHiveSync() { - this.hiveSyncExecutor = new NonThrownExecutor(LOG, true); + this.hiveSyncExecutor = NonThrownExecutor.builder(LOG).waitForTasksFinish(true).build(); this.hiveSyncContext = HiveSyncContext.create(conf); } @@ -296,24 +299,7 @@ public void syncHive() { } private void initMetadataSync() { - this.metadataSyncExecutor = new NonThrownExecutor(LOG, true); - } - - /** - * Sync the write metadata to the metadata table. - */ - private void syncMetadataIfEnabled() { - if (tableState.syncMetadata) { - this.metadataSyncExecutor.execute(this::syncMetadata, - "sync metadata table for instant %s", this.instant); - } - } - - /** - * Sync the write metadata to the metadata table. - */ - private void syncMetadata() { - this.writeClient.syncTableMetadata(); + this.writeClient.initMetadataWriter(); } private void reset() { @@ -338,9 +324,11 @@ private void addEventToBuffer(WriteMetadataEvent event) { private void startInstant() { final String instant = HoodieActiveTimeline.createNewInstantTime(); - this.writeClient.startCommitWithTime(instant, tableState.commitAction); + // put the assignment in front of metadata generation, + // because the instant request from write task is asynchronous. this.instant = instant; - this.writeClient.transitionRequestedToInflight(tableState.commitAction, this.instant); + this.writeClient.startCommitWithTime(instant, tableState.commitAction); + this.metaClient.getActiveTimeline().transitionRequestedToInflight(tableState.commitAction, this.instant); this.writeClient.upgradeDowngrade(this.instant); LOG.info("Create instant [{}] for table [{}] with type [{}]", this.instant, this.conf.getString(FlinkOptions.TABLE_NAME), conf.getString(FlinkOptions.TABLE_TYPE)); @@ -366,11 +354,6 @@ private void initInstant(String instant) { LOG.info("Recommit instant {}", instant); commitInstant(instant); } - if (tableState.syncMetadata) { - // initialize metadata table first if enabled - // condition: the data set timeline has committed instants - syncMetadata(); - } // starts a new instant startInstant(); }, "initialize instant %s", instant); @@ -391,8 +374,6 @@ private void handleEndInputEvent(WriteMetadataEvent event) { commitInstant(this.instant); // sync Hive if is enabled in batch mode. syncHiveIfEnabled(); - // sync metadata if is enabled in batch mode. - syncMetadataIfEnabled(); } } @@ -412,23 +393,42 @@ private void handleWriteMetaEvent(WriteMetadataEvent event) { * The coordinator reuses the instant if there is no data for this round of checkpoint, * sends the commit ack events to unblock the flushing. 
*/ - private void sendCommitAckEvents() { - CompletableFuture[] futures = IntStream.range(0, this.parallelism) - .mapToObj(taskID -> this.gateways[taskID].sendEvent(CommitAckEvent.getInstance())) + private void sendCommitAckEvents(long checkpointId) { + CompletableFuture[] futures = Arrays.stream(this.gateways).filter(Objects::nonNull) + .map(gw -> gw.sendEvent(CommitAckEvent.getInstance(checkpointId))) .toArray(CompletableFuture[]::new); try { CompletableFuture.allOf(futures).get(); - } catch (Exception e) { - throw new HoodieException("Error while waiting for the commit ack events to finish sending", e); + } catch (Throwable throwable) { + if (!sendToFinishedTasks(throwable)) { + throw new HoodieException("Error while waiting for the commit ack events to finish sending", throwable); + } } } + /** + * Decides whether the given exception is caused by sending events to FINISHED tasks. + * + *

Ugly impl: the exception may change in the future. + */ + private static boolean sendToFinishedTasks(Throwable throwable) { + return throwable.getCause() instanceof TaskNotRunningException + || throwable.getCause().getMessage().contains("running"); + } + + /** + * Commits the instant. + */ + private void commitInstant(String instant) { + commitInstant(instant, -1); + } + /** * Commits the instant. * * @return true if the write statuses are committed successfully. */ - private boolean commitInstant(String instant) { + private boolean commitInstant(String instant, long checkpointId) { if (Arrays.stream(eventBuffer).allMatch(Objects::isNull)) { // The last checkpoint finished successfully. return false; @@ -444,7 +444,7 @@ private boolean commitInstant(String instant) { // No data has written, reset the buffer and returns early reset(); // Send commit ack event to the write function to unblock the flushing - sendCommitAckEvents(); + sendCommitAckEvents(checkpointId); return false; } doCommit(instant, writeResults); @@ -505,33 +505,19 @@ public String getInstant() { return instant; } - @VisibleForTesting - @SuppressWarnings("rawtypes") - public HoodieFlinkWriteClient getWriteClient() { - return writeClient; - } - @VisibleForTesting public Context getContext() { return context; } @VisibleForTesting - public void setExecutor(CoordinatorExecutor executor) throws Exception { + public void setExecutor(NonThrownExecutor executor) throws Exception { if (this.executor != null) { this.executor.close(); } this.executor = executor; } - @VisibleForTesting - public void setMetadataSyncExecutor(NonThrownExecutor executor) throws Exception { - if (this.metadataSyncExecutor != null) { - this.metadataSyncExecutor.close(); - } - this.metadataSyncExecutor = executor; - } - // ------------------------------------------------------------------------- // Inner Class // ------------------------------------------------------------------------- diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java index 128c03010e6d7..090ed29b8e332 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java @@ -72,8 +72,6 @@ public void snapshotState() { // it would check the validity. // wait for the buffer data flush out and request a new instant flushData(false); - // nullify the write helper for next ckp - this.writerHelper = null; } @Override @@ -84,20 +82,11 @@ public void processElement(I value, Context ctx, Collector out) throws E this.writerHelper.write((RowData) value); } - @Override - public void close() { - if (this.writeClient != null) { - this.writeClient.cleanHandlesGracefully(); - this.writeClient.close(); - } - } - /** * End input action for batch source. */ public void endInput() { flushData(true); - this.writeClient.cleanHandles(); this.writeStatuses.clear(); } @@ -124,6 +113,10 @@ this.currentInstant, this.taskID, getRuntimeContext().getNumberOfParallelSubtask } private void flushData(boolean endInput) { + if (this.writerHelper == null) { + // does not process any inputs, returns early. 
+ return; + } final List writeStatus = this.writerHelper.getWriteStatuses(this.taskID); final WriteMetadataEvent event = WriteMetadataEvent.builder() .taskID(taskID) @@ -133,5 +126,10 @@ private void flushData(boolean endInput) { .endInput(endInput) .build(); this.eventGateway.sendEventToCoordinator(event); + // nullify the write helper for next ckp + this.writerHelper = null; + this.writeStatuses.addAll(writeStatus); + // blocks flushing until the coordinator starts a new instant + this.confirming = true; } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java index 3ac7aa1e66c29..f6055ba11d2fc 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java @@ -18,31 +18,26 @@ package org.apache.hudi.sink.bootstrap; -import org.apache.hudi.client.FlinkTaskContextSupplier; -import org.apache.hudi.client.HoodieFlinkWriteClient; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordGlobalLocation; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.BaseFileUtils; -import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunction; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.format.FormatUtils; +import org.apache.hudi.util.FlinkTables; import org.apache.hudi.util.StreamerUtil; import org.apache.avro.Schema; @@ -54,6 +49,7 @@ import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.runtime.state.StateInitializationContext; import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; import org.apache.flink.streaming.api.operators.AbstractStreamOperator; import org.apache.flink.streaming.api.operators.OneInputStreamOperator; import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; @@ -64,6 +60,7 @@ import java.util.Collections; import java.util.Iterator; import java.util.List; +import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import static java.util.stream.Collectors.toList; @@ -77,7 +74,7 @@ * *

The output records should then shuffle by the recordKey and thus do scalable write. */ -public class BootstrapOperator +public class BootstrapOperator> extends AbstractStreamOperator implements OneInputStreamOperator { private static final Logger LOG = LoggerFactory.getLogger(BootstrapOperator.class); @@ -89,11 +86,11 @@ public class BootstrapOperator protected transient org.apache.hadoop.conf.Configuration hadoopConf; protected transient HoodieWriteConfig writeConfig; + private transient GlobalAggregateManager aggregateManager; + private transient ListState instantState; private final Pattern pattern; private String lastInstantTime; - private HoodieFlinkWriteClient writeClient; - private String actionType; public BootstrapOperator(Configuration conf) { this.conf = conf; @@ -102,7 +99,8 @@ public BootstrapOperator(Configuration conf) { @Override public void snapshotState(StateSnapshotContext context) throws Exception { - lastInstantTime = this.writeClient.getLastPendingInstant(this.actionType); + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(this.conf); + lastInstantTime = StreamerUtil.getLastPendingInstant(metaClient); instantState.update(Collections.singletonList(lastInstantTime)); } @@ -122,13 +120,17 @@ public void initializeState(StateInitializationContext context) throws Exception } this.hadoopConf = StreamerUtil.getHadoopConf(); - this.writeConfig = StreamerUtil.getHoodieClientConfig(this.conf); - this.hoodieTable = getTable(); - this.writeClient = StreamerUtil.createWriteClient(this.conf, getRuntimeContext()); - this.actionType = CommitUtils.getCommitActionType( - WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)), - HoodieTableType.valueOf(conf.getString(FlinkOptions.TABLE_TYPE))); + this.writeConfig = StreamerUtil.getHoodieClientConfig(this.conf, true); + this.hoodieTable = FlinkTables.createTable(writeConfig, hadoopConf, getRuntimeContext()); + this.aggregateManager = getRuntimeContext().getGlobalAggregateManager(); + + preLoadIndexRecords(); + } + /** + * Load the index records before {@link #processElement}. + */ + protected void preLoadIndexRecords() throws Exception { String basePath = hoodieTable.getMetaClient().getBasePath(); int taskID = getRuntimeContext().getIndexOfThisSubtask(); LOG.info("Start loading records in table {} into the index state, taskId = {}", basePath, taskID); @@ -139,6 +141,27 @@ public void initializeState(StateInitializationContext context) throws Exception } LOG.info("Finish sending index records, taskId = {}.", getRuntimeContext().getIndexOfThisSubtask()); + + // wait for the other bootstrap tasks finish bootstrapping. + waitForBootstrapReady(getRuntimeContext().getIndexOfThisSubtask()); + } + + /** + * Wait for other bootstrap tasks to finish the index bootstrap. 
+ */ + private void waitForBootstrapReady(int taskID) { + int taskNum = getRuntimeContext().getNumberOfParallelSubtasks(); + int readyTaskNum = 1; + while (taskNum != readyTaskNum) { + try { + readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME, taskID, new BootstrapAggFunction()); + LOG.info("Waiting for other bootstrap tasks to complete, taskId = {}.", taskID); + + TimeUnit.SECONDS.sleep(5); + } catch (Exception e) { + LOG.warn("Update global task bootstrap summary error", e); + } + } } @Override @@ -147,13 +170,6 @@ public void processElement(StreamRecord element) throws Exception { output.collect((StreamRecord) element); } - private HoodieFlinkTable getTable() { - HoodieFlinkEngineContext context = new HoodieFlinkEngineContext( - new SerializableConfiguration(this.hadoopConf), - new FlinkTaskContextSupplier(getRuntimeContext())); - return HoodieFlinkTable.create(this.writeConfig, context); - } - /** * Loads all the indices of give partition path into the backup state. * @@ -242,7 +258,7 @@ public static HoodieRecord generateHoodieRecord(HoodieKey hoodieKey, FileSlice f return hoodieRecord; } - private static boolean shouldLoadFile(String fileId, + protected boolean shouldLoadFile(String fileId, int maxParallelism, int parallelism, int taskID) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAccumulator.java similarity index 50% rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java rename to hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAccumulator.java index 2e6064a408724..14630a1f89b72 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAccumulator.java @@ -7,7 +7,7 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,20 +16,38 @@ * limitations under the License. */ -package org.apache.hudi.table.upgrade; +package org.apache.hudi.sink.bootstrap.aggregate; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; +import java.io.Serializable; +import java.util.HashSet; +import java.util.Set; /** - * Downgrade handle to assist in downgrading hoodie table from version 1 to 0. + * Bootstrap ready task id accumulator. 
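// Editor's note: a small illustration (not part of the patch) of the aggregate contract that
// backs waitForBootstrapReady(): every subtask reports its own index through Flink's
// GlobalAggregateManager, and the aggregate result is the number of distinct subtasks seen so
// far, so each task can poll until that count reaches the parallelism. BootstrapAggFunction and
// BootstrapAccumulator are the classes added in this patch (package org.apache.hudi.sink.bootstrap.aggregate).
BootstrapAggFunction agg = new BootstrapAggFunction();
BootstrapAccumulator acc = agg.createAccumulator();
acc = agg.add(0, acc);               // subtask 0 reports its id
acc = agg.add(1, acc);               // subtask 1 reports its id
acc = agg.add(1, acc);               // duplicate reports are idempotent (backed by a Set)
int readyTasks = agg.getResult(acc); // -> 2; bootstrap tasks keep polling until this equals the parallelism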
*/ -public class OneToZeroDowngradeHandler extends BaseOneToZeroDowngradeHandler { +public class BootstrapAccumulator implements Serializable { + private static final long serialVersionUID = 1L; - @Override - HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { - return HoodieSparkTable.create(config, context); + private final Set readyTaskSet; + + public BootstrapAccumulator() { + this.readyTaskSet = new HashSet<>(); + } + + public void update(int taskId) { + readyTaskSet.add(taskId); + } + + public int readyTaskNum() { + return readyTaskSet.size(); + } + + public BootstrapAccumulator merge(BootstrapAccumulator acc) { + if (acc == null) { + return this; + } + + readyTaskSet.addAll(acc.readyTaskSet); + return this; } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAggFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAggFunction.java new file mode 100644 index 0000000000000..8c42fe903ad3c --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAggFunction.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bootstrap.aggregate; + +import org.apache.flink.api.common.functions.AggregateFunction; + +/** + * Aggregate function that accumulates the loaded task number of + * function {@link org.apache.hudi.sink.bootstrap.BootstrapOperator}. + */ +public class BootstrapAggFunction implements AggregateFunction { + public static final String NAME = BootstrapAggFunction.class.getSimpleName(); + + @Override + public BootstrapAccumulator createAccumulator() { + return new BootstrapAccumulator(); + } + + @Override + public BootstrapAccumulator add(Integer taskId, BootstrapAccumulator bootstrapAccumulator) { + bootstrapAccumulator.update(taskId); + return bootstrapAccumulator; + } + + @Override + public Integer getResult(BootstrapAccumulator bootstrapAccumulator) { + return bootstrapAccumulator.readyTaskNum(); + } + + @Override + public BootstrapAccumulator merge(BootstrapAccumulator bootstrapAccumulator, BootstrapAccumulator acc) { + return bootstrapAccumulator.merge(acc); + } +} \ No newline at end of file diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/batch/BatchBootstrapOperator.java b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/batch/BatchBootstrapOperator.java index ac4c2b1798876..ead00d40a936d 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/batch/BatchBootstrapOperator.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/batch/BatchBootstrapOperator.java @@ -39,7 +39,7 @@ * *

The input records should shuffle by the partition path to avoid repeated loading. */ -public class BatchBootstrapOperator +public class BatchBootstrapOperator> extends BootstrapOperator { private Set partitionPathSet; @@ -56,10 +56,15 @@ public void open() throws Exception { this.haveSuccessfulCommits = StreamerUtil.haveSuccessfulCommits(hoodieTable.getMetaClient()); } + @Override + protected void preLoadIndexRecords() { + // no operation + } + @Override @SuppressWarnings("unchecked") public void processElement(StreamRecord element) throws Exception { - final HoodieRecord record = (HoodieRecord) element.getValue(); + final HoodieRecord record = (HoodieRecord) element.getValue(); final String partitionPath = record.getKey().getPartitionPath(); if (haveSuccessfulCommits && !partitionPathSet.contains(partitionPath)) { @@ -70,4 +75,10 @@ public void processElement(StreamRecord element) throws Exception { // send the trigger record output.collect((StreamRecord) element); } + + @Override + protected boolean shouldLoadFile(String fileId, int maxParallelism, int parallelism, int taskID) { + // load all the file groups in the partition + return true; + } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteFunction.java index 7fce5c0a370c5..4089907243c87 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteFunction.java @@ -20,9 +20,8 @@ import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.sink.StreamWriteOperatorCoordinator; import org.apache.hudi.sink.common.AbstractWriteFunction; @@ -80,6 +79,11 @@ public class BulkInsertWriteFunction */ private int taskID; + /** + * Meta Client. + */ + private transient HoodieTableMetaClient metaClient; + /** * Write Client. */ @@ -95,11 +99,6 @@ public class BulkInsertWriteFunction */ private transient OperatorEventGateway eventGateway; - /** - * Commit action type. - */ - private transient String actionType; - /** * Constructs a StreamingSinkFunction. * @@ -113,12 +112,9 @@ public BulkInsertWriteFunction(Configuration config, RowType rowType) { @Override public void open(Configuration parameters) throws IOException { this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + this.metaClient = StreamerUtil.createMetaClient(this.config); this.writeClient = StreamerUtil.createWriteClient(this.config, getRuntimeContext()); - this.actionType = CommitUtils.getCommitActionType( - WriteOperationType.fromValue(config.getString(FlinkOptions.OPERATION)), - HoodieTableType.valueOf(config.getString(FlinkOptions.TABLE_TYPE))); - - this.initInstant = this.writeClient.getLastPendingInstant(this.actionType); + this.initInstant = StreamerUtil.getLastPendingInstant(this.metaClient, false); sendBootstrapEvent(); initWriterHelper(); } @@ -187,8 +183,15 @@ private void sendBootstrapEvent() { LOG.info("Send bootstrap write metadata event to coordinator, task[{}].", taskID); } + /** + * Returns the last pending instant time. 
+ */ + protected String lastPendingInstant() { + return StreamerUtil.getLastPendingInstant(this.metaClient); + } + private String instantToWrite() { - String instant = this.writeClient.getLastPendingInstant(this.actionType); + String instant = lastPendingInstant(); // if exactly-once semantics turns on, // waits for the checkpoint notification until the checkpoint timeout threshold hits. TimeWait timeWait = TimeWait.builder() @@ -202,7 +205,7 @@ private String instantToWrite() { // sleep for a while timeWait.waitFor(); // refresh the inflight instant - instant = this.writeClient.getLastPendingInstant(this.actionType); + instant = lastPendingInstant(); } return instant; } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriterHelper.java b/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriterHelper.java index e0cbab60289af..4bc8ae27fb5d5 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriterHelper.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriterHelper.java @@ -181,7 +181,7 @@ public List getWriteStatuses(int taskID) { return getHoodieWriteStatuses().stream() .map(BulkInsertWriterHelper::toWriteStatus).collect(Collectors.toList()); } catch (IOException e) { - throw new HoodieException("Error collect the write status for task [" + taskID + "]"); + throw new HoodieException("Error collect the write status for task [" + taskID + "]", e); } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java index 654f0b86449ec..0e7300591286f 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java @@ -20,9 +20,7 @@ import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.sink.StreamWriteOperatorCoordinator; @@ -70,6 +68,11 @@ public abstract class AbstractStreamWriteFunction */ protected int taskID; + /** + * Meta Client. + */ + protected transient HoodieTableMetaClient metaClient; + /** * Write Client. */ @@ -85,11 +88,6 @@ public abstract class AbstractStreamWriteFunction */ protected transient OperatorEventGateway eventGateway; - /** - * Commit action type. - */ - protected transient String actionType; - /** * Flag saying whether the write task is waiting for the checkpoint success notification * after it finished a checkpoint. 
@@ -128,11 +126,8 @@ public AbstractStreamWriteFunction(Configuration config) { @Override public void initializeState(FunctionInitializationContext context) throws Exception { this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + this.metaClient = StreamerUtil.createMetaClient(this.config); this.writeClient = StreamerUtil.createWriteClient(this.config, getRuntimeContext()); - this.actionType = CommitUtils.getCommitActionType( - WriteOperationType.fromValue(config.getString(FlinkOptions.OPERATION)), - HoodieTableType.valueOf(config.getString(FlinkOptions.TABLE_TYPE))); - this.writeStatuses = new ArrayList<>(); this.writeMetadataState = context.getOperatorStateStore().getListState( new ListStateDescriptor<>( @@ -140,7 +135,7 @@ public void initializeState(FunctionInitializationContext context) throws Except TypeInformation.of(WriteMetadataEvent.class) )); - this.currentInstant = this.writeClient.getLastPendingInstant(this.actionType); + this.currentInstant = lastPendingInstant(); if (context.isRestored()) { restoreWriteMetadata(); } else { @@ -162,12 +157,6 @@ public void snapshotState(FunctionSnapshotContext functionSnapshotContext) throw // ------------------------------------------------------------------------- // Getter/Setter // ------------------------------------------------------------------------- - @VisibleForTesting - @SuppressWarnings("rawtypes") - public HoodieFlinkWriteClient getWriteClient() { - return writeClient; - } - @VisibleForTesting public boolean isConfirming() { return this.confirming; @@ -182,7 +171,7 @@ public void setOperatorEventGateway(OperatorEventGateway operatorEventGateway) { // ------------------------------------------------------------------------- private void restoreWriteMetadata() throws Exception { - String lastInflight = this.writeClient.getLastPendingInstant(this.actionType); + String lastInflight = lastPendingInstant(); boolean eventSent = false; for (WriteMetadataEvent event : this.writeMetadataState.get()) { if (Objects.equals(lastInflight, event.getInstantTime())) { @@ -224,6 +213,13 @@ public void handleOperatorEvent(OperatorEvent event) { this.confirming = false; } + /** + * Returns the last pending instant time. + */ + protected String lastPendingInstant() { + return StreamerUtil.getLastPendingInstant(this.metaClient); + } + /** * Prepares the instant time to write with for next checkpoint. * @@ -231,7 +227,7 @@ public void handleOperatorEvent(OperatorEvent event) { * @return The instant time */ protected String instantToWrite(boolean hasData) { - String instant = this.writeClient.getLastPendingInstant(this.actionType); + String instant = lastPendingInstant(); // if exactly-once semantics turns on, // waits for the checkpoint notification until the checkpoint timeout threshold hits. TimeWait timeWait = TimeWait.builder() @@ -246,9 +242,9 @@ protected String instantToWrite(boolean hasData) { // sleep for a while timeWait.waitFor(); // refresh the inflight instant - instant = this.writeClient.getLastPendingInstant(this.actionType); + instant = lastPendingInstant(); } else { - // the inflight instant changed, which means the last instant was committed + // the pending instant changed, that means the last instant was committed // successfully. 
confirming = false; } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactFunction.java index 5916244293347..560b5ffbad305 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactFunction.java @@ -22,7 +22,8 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.CompactionOperation; import org.apache.hudi.sink.utils.NonThrownExecutor; -import org.apache.hudi.table.action.compact.FlinkCompactHelpers; +import org.apache.hudi.table.HoodieFlinkCopyOnWriteTable; +import org.apache.hudi.table.action.compact.HoodieFlinkMergeOnReadTableCompactor; import org.apache.hudi.util.StreamerUtil; import org.apache.flink.annotation.VisibleForTesting; @@ -77,7 +78,7 @@ public void open(Configuration parameters) throws Exception { this.taskID = getRuntimeContext().getIndexOfThisSubtask(); this.writeClient = StreamerUtil.createWriteClient(conf, getRuntimeContext()); if (this.asyncCompaction) { - this.executor = new NonThrownExecutor(LOG); + this.executor = NonThrownExecutor.builder(LOG).build(); } } @@ -89,6 +90,7 @@ public void processElement(CompactionPlanEvent event, Context context, Collector // executes the compaction task asynchronously to not block the checkpoint barrier propagate. executor.execute( () -> doCompaction(instantTime, compactionOperation, collector), + (errMsg, t) -> collector.collect(new CompactionCommitEvent(instantTime, compactionOperation.getFileId(), taskID)), "Execute compaction for instant %s from task %d", instantTime, taskID); } else { // executes the compaction task synchronously for batch mode. @@ -98,8 +100,18 @@ public void processElement(CompactionPlanEvent event, Context context, Collector } private void doCompaction(String instantTime, CompactionOperation compactionOperation, Collector collector) throws IOException { - List writeStatuses = FlinkCompactHelpers.compact(writeClient, instantTime, compactionOperation); - collector.collect(new CompactionCommitEvent(instantTime, writeStatuses, taskID)); + HoodieFlinkMergeOnReadTableCompactor compactor = new HoodieFlinkMergeOnReadTableCompactor(); + List writeStatuses = compactor.compact( + new HoodieFlinkCopyOnWriteTable<>( + writeClient.getConfig(), + writeClient.getEngineContext(), + writeClient.getHoodieTable().getMetaClient()), + writeClient.getHoodieTable().getMetaClient(), + writeClient.getConfig(), + compactionOperation, + instantTime, + writeClient.getHoodieTable().getTaskContextSupplier()); + collector.collect(new CompactionCommitEvent(instantTime, compactionOperation.getFileId(), writeStatuses, taskID)); } @VisibleForTesting diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitEvent.java b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitEvent.java index 52c0812d8c718..398dfcf6195fb 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitEvent.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitEvent.java @@ -33,6 +33,12 @@ public class CompactionCommitEvent implements Serializable { * The compaction commit instant time. */ private String instant; + + /** + * The file ID. + */ + private String fileId; + /** * The write statuses. 
*/ @@ -45,16 +51,36 @@ public class CompactionCommitEvent implements Serializable { public CompactionCommitEvent() { } - public CompactionCommitEvent(String instant, List writeStatuses, int taskID) { + /** + * An event with NULL write statuses that represents a failed compaction. + */ + public CompactionCommitEvent(String instant, String fileId, int taskID) { + this(instant, fileId, null, taskID); + } + + public CompactionCommitEvent(String instant, String fileId, List writeStatuses, int taskID) { this.instant = instant; + this.fileId = fileId; this.writeStatuses = writeStatuses; this.taskID = taskID; } + public boolean isFailed() { + return this.writeStatuses == null; + } + + // ------------------------------------------------------------------------- + // Getter/Setter + // ------------------------------------------------------------------------- + public void setInstant(String instant) { this.instant = instant; } + public void setFileId(String fileId) { + this.fileId = fileId; + } + public void setWriteStatuses(List writeStatuses) { this.writeStatuses = writeStatuses; } @@ -67,6 +93,10 @@ public String getInstant() { return instant; } + public String getFileId() { + return fileId; + } + public List getWriteStatuses() { return writeStatuses; } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java index e6c4cedaae3ab..0309278f483e1 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java @@ -20,12 +20,12 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.util.CompactionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.sink.CleanFunction; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.CompactionUtil; import org.apache.hudi.util.StreamerUtil; import org.apache.flink.configuration.Configuration; @@ -33,7 +33,6 @@ import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; @@ -61,9 +60,17 @@ public class CompactionCommitSink extends CleanFunction { /** * Buffer to collect the event from each compact task {@code CompactFunction}. - * The key is the instant time. + * + *
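// Editor's note: an illustrative snippet (not part of the patch) of the failure signal that the
// new fileId field enables; instantTime, fileId, taskID and writeStatuses stand in for the
// caller's own values. An event built without write statuses marks a failed compaction task, and
// CompactionCommitSink#invoke short-circuits on it and rolls the plan back via
// CompactionUtil.rollbackCompaction(table, event.getInstant()) instead of buffering it.
CompactionCommitEvent failed = new CompactionCommitEvent(instantTime, fileId, taskID);
// failed.isFailed() == true, because its writeStatuses list is null
CompactionCommitEvent succeeded = new CompactionCommitEvent(instantTime, fileId, writeStatuses, taskID);
// succeeded.isFailed() == false; the sink buffers it per (instant, fileId) and commits once the
// buffered events cover every operation of the compaction plan for that instant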

Stores the mapping of instant_time -> file_id -> event. Use a map to collect the + * events because the rolling back of intermediate compaction tasks generates corrupt + * events. + */ + private transient Map> commitBuffer; + + /** + * The hoodie table. */ - private transient Map> commitBuffer; + private transient HoodieFlinkTable table; public CompactionCommitSink(Configuration conf) { super(conf); @@ -77,14 +84,20 @@ public void open(Configuration parameters) throws Exception { this.writeClient = StreamerUtil.createWriteClient(conf, getRuntimeContext()); } this.commitBuffer = new HashMap<>(); + this.table = this.writeClient.getHoodieTable(); } @Override public void invoke(CompactionCommitEvent event, Context context) throws Exception { final String instant = event.getInstant(); - commitBuffer.computeIfAbsent(instant, k -> new ArrayList<>()) - .add(event); - commitIfNecessary(instant, commitBuffer.get(instant)); + if (event.isFailed()) { + // handle failure case + CompactionUtil.rollbackCompaction(table, event.getInstant()); + return; + } + commitBuffer.computeIfAbsent(instant, k -> new HashMap<>()) + .put(event.getFileId(), event); + commitIfNecessary(instant, commitBuffer.get(instant).values()); } /** @@ -94,39 +107,38 @@ public void invoke(CompactionCommitEvent event, Context context) throws Exceptio * @param instant Compaction commit instant time * @param events Commit events ever received for the instant */ - private void commitIfNecessary(String instant, List events) throws IOException { + private void commitIfNecessary(String instant, Collection events) throws IOException { HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan( this.writeClient.getHoodieTable().getMetaClient(), instant); boolean isReady = compactionPlan.getOperations().size() == events.size(); if (!isReady) { return; } + try { + doCommit(instant, events); + } catch (Throwable throwable) { + // make it fail-safe + LOG.error("Error while committing compaction instant: " + instant, throwable); + } finally { + // reset the status + reset(instant); + } + } + + @SuppressWarnings("unchecked") + private void doCommit(String instant, Collection events) throws IOException { List statuses = events.stream() .map(CompactionCommitEvent::getWriteStatuses) .flatMap(Collection::stream) .collect(Collectors.toList()); - if (this.writeClient.getConfig().shouldAutoCommit()) { - // Prepare the commit metadata. 
- List updateStatusMap = statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()); - HoodieCommitMetadata metadata = new HoodieCommitMetadata(true); - for (HoodieWriteStat stat : updateStatusMap) { - metadata.addWriteStat(stat.getPartitionPath(), stat); - } - metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, writeClient.getConfig().getSchema()); - this.writeClient.completeCompaction( - metadata, statuses, this.writeClient.getHoodieTable(), instant); - } // commit the compaction this.writeClient.commitCompaction(instant, statuses, Option.empty()); - // Whether to cleanup the old log file when compaction + // Whether to clean up the old log file when compaction if (!conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED)) { this.writeClient.clean(); } - - // reset the status - reset(instant); } private void reset(String instant) { diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java index a17ea04040046..f6dd241ec069c 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java @@ -19,7 +19,6 @@ package org.apache.hudi.sink.compact; import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.common.model.CompactionOperation; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -27,7 +26,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.table.HoodieFlinkTable; import org.apache.hudi.util.CompactionUtil; -import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.util.FlinkTables; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.configuration.Configuration; @@ -38,7 +37,6 @@ import java.io.IOException; import java.util.List; -import java.util.Objects; import static java.util.stream.Collectors.toList; @@ -56,14 +54,10 @@ public class CompactionPlanOperator extends AbstractStreamOperator streamRecord) { @Override public void notifyCheckpointComplete(long checkpointId) { try { - HoodieFlinkTable hoodieTable = writeClient.getHoodieTable(); - CompactionUtil.rollbackCompaction(hoodieTable, writeClient, conf); - scheduleCompaction(hoodieTable, checkpointId); + table.getMetaClient().reloadActiveTimeline(); + // There is no good way to infer when the compaction task for an instant crushed + // or is still undergoing. So we use a configured timeout threshold to control the rollback: + // {@code FlinkOptions.COMPACTION_TIMEOUT_SECONDS}, + // when the earliest inflight instant has timed out, assumes it has failed + // already and just rolls it back. + + // comment out: do we really need the timeout rollback ? + // CompactionUtil.rollbackEarliestCompaction(table, conf); + scheduleCompaction(table, checkpointId); } catch (Throwable throwable) { - // make it fail safe - LOG.error("Error while scheduling compaction at instant: " + compactionInstantTime, throwable); + // make it fail-safe + LOG.error("Error while scheduling compaction plan for checkpoint: " + checkpointId, throwable); } } private void scheduleCompaction(HoodieFlinkTable table, long checkpointId) throws IOException { // the last instant takes the highest priority. 
- Option lastRequested = table.getActiveTimeline().filterPendingCompactionTimeline() - .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED).lastInstant(); - if (!lastRequested.isPresent()) { + Option firstRequested = table.getActiveTimeline().filterPendingCompactionTimeline() + .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED).firstInstant(); + if (!firstRequested.isPresent()) { // do nothing. LOG.info("No compaction plan for checkpoint " + checkpointId); return; } - String compactionInstantTime = lastRequested.get().getTimestamp(); - if (this.compactionInstantTime != null - && Objects.equals(this.compactionInstantTime, compactionInstantTime)) { - // do nothing - LOG.info("Duplicate scheduling for compaction instant: " + compactionInstantTime + ", ignore"); - return; - } + String compactionInstantTime = firstRequested.get().getTimestamp(); // generate compaction plan // should support configurable commit metadata @@ -118,9 +117,8 @@ private void scheduleCompaction(HoodieFlinkTable table, long checkpointId) th if (compactionPlan == null || (compactionPlan.getOperations() == null) || (compactionPlan.getOperations().isEmpty())) { // do nothing. - LOG.info("No compaction plan for checkpoint " + checkpointId + " and instant " + compactionInstantTime); + LOG.info("Empty compaction plan for instant " + compactionInstantTime); } else { - this.compactionInstantTime = compactionInstantTime; HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); // Mark instant as compaction inflight table.getActiveTimeline().transitionCompactionRequestedToInflight(instant); @@ -128,7 +126,7 @@ private void scheduleCompaction(HoodieFlinkTable table, long checkpointId) th List operations = compactionPlan.getOperations().stream() .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList()); - LOG.info("CompactionPlanOperator compacting " + operations + " files"); + LOG.info("Execute compaction plan for instant {} as {} file groups", compactionInstantTime, operations.size()); for (CompactionOperation operation : operations) { output.collect(new StreamRecord<>(new CompactionPlanEvent(compactionInstantTime, operation))); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java index 18d49f1be359d..aebcc7d6ee98f 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java @@ -66,7 +66,10 @@ public static void main(String[] args) throws Exception { // set table schema CompactionUtil.setAvroSchema(conf, metaClient); - HoodieFlinkWriteClient writeClient = StreamerUtil.createWriteClient(conf, null); + // infer changelog mode + CompactionUtil.inferChangelogMode(conf, metaClient); + + HoodieFlinkWriteClient writeClient = StreamerUtil.createWriteClient(conf); HoodieFlinkTable table = writeClient.getHoodieTable(); // judge whether have operation @@ -96,7 +99,7 @@ public static void main(String[] args) throws Exception { HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime); if (timeline.containsInstant(inflightInstant)) { LOG.info("Rollback inflight compaction instant: [" + compactionInstantTime + "]"); - writeClient.rollbackInflightCompaction(inflightInstant, table); + table.rollbackInflightCompaction(inflightInstant); 
table.getMetaClient().reloadActiveTimeline(); } @@ -148,5 +151,6 @@ public static void main(String[] args) throws Exception { .setParallelism(1); env.execute("flink_hudi_compaction"); + writeClient.close(); } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/event/CommitAckEvent.java b/hudi-flink/src/main/java/org/apache/hudi/sink/event/CommitAckEvent.java index 541fd062f8ff7..84274f0e2eb42 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/event/CommitAckEvent.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/event/CommitAckEvent.java @@ -26,13 +26,25 @@ public class CommitAckEvent implements OperatorEvent { private static final long serialVersionUID = 1L; - private static final CommitAckEvent INSTANCE = new CommitAckEvent(); + private long checkpointId; + + public CommitAckEvent(long checkpointId) { + this.checkpointId = checkpointId; + } // default constructor for efficient serialization public CommitAckEvent() { } - public static CommitAckEvent getInstance() { - return INSTANCE; + public long getCheckpointId() { + return checkpointId; + } + + public void setCheckpointId(long checkpointId) { + this.checkpointId = checkpointId; + } + + public static CommitAckEvent getInstance(long checkpointId) { + return new CommitAckEvent(checkpointId); } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java index 15dbae6df4e15..cff24d97f1d6a 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java @@ -116,7 +116,7 @@ public BucketAssignFunction(Configuration conf) { @Override public void open(Configuration parameters) throws Exception { super.open(parameters); - HoodieWriteConfig writeConfig = StreamerUtil.getHoodieClientConfig(this.conf); + HoodieWriteConfig writeConfig = StreamerUtil.getHoodieClientConfig(this.conf, true); HoodieFlinkEngineContext context = new HoodieFlinkEngineContext( new SerializableConfiguration(StreamerUtil.getHadoopConf()), new FlinkTaskContextSupplier(getRuntimeContext())); diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigner.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigner.java index 43168aef0a278..f9d5b1c1faa9b 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigner.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigner.java @@ -94,6 +94,11 @@ public class BucketAssigner implements AutoCloseable { */ private final Map newFileAssignStates; + /** + * Num of accumulated successful checkpoints, used for cleaning the new file assign state. 
+ */ + private int accCkp = 0; + public BucketAssigner( int taskID, int maxParallelism, @@ -117,7 +122,6 @@ public BucketAssigner( */ public void reset() { bucketInfoMap.clear(); - newFileAssignStates.clear(); } public BucketInfo addUpdate(String partitionPath, String fileIdHint) { @@ -136,16 +140,7 @@ public BucketInfo addInsert(String partitionPath) { // first try packing this into one of the smallFiles if (smallFileAssign != null && smallFileAssign.assign()) { - final String key = StreamerUtil.generateBucketKey(partitionPath, smallFileAssign.getFileId()); - // create a new bucket or reuse an existing bucket - BucketInfo bucketInfo; - if (bucketInfoMap.containsKey(key)) { - // Assigns an inserts to existing update bucket - bucketInfo = bucketInfoMap.get(key); - } else { - bucketInfo = addUpdate(partitionPath, smallFileAssign.getFileId()); - } - return bucketInfo; + return new BucketInfo(BucketType.UPDATE, smallFileAssign.getFileId(), partitionPath); } // if we have anything more, create new insert buckets, like normal @@ -154,7 +149,20 @@ public BucketInfo addInsert(String partitionPath) { if (newFileAssignState.canAssign()) { newFileAssignState.assign(); final String key = StreamerUtil.generateBucketKey(partitionPath, newFileAssignState.fileId); - return bucketInfoMap.get(key); + if (bucketInfoMap.containsKey(key)) { + // the newFileAssignStates is cleaned asynchronously when received the checkpoint success notification, + // the records processed within the time range: + // (start checkpoint, checkpoint success(and instant committed)) + // should still be assigned to the small buckets of last checkpoint instead of new one. + + // the bucketInfoMap is cleaned when checkpoint starts. + + // A promotion: when the HoodieRecord can record whether it is an UPDATE or INSERT, + // we can always return an UPDATE BucketInfo here, and there is no need to record the + // UPDATE bucket through calling #addUpdate. + return bucketInfoMap.get(key); + } + return new BucketInfo(BucketType.UPDATE, newFileAssignState.fileId, partitionPath); } } BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, createFileIdOfThisTask(), partitionPath); @@ -166,7 +174,7 @@ public BucketInfo addInsert(String partitionPath) { return bucketInfo; } - private SmallFileAssign getSmallFileAssign(String partitionPath) { + private synchronized SmallFileAssign getSmallFileAssign(String partitionPath) { if (smallFileAssignMap.containsKey(partitionPath)) { return smallFileAssignMap.get(partitionPath); } @@ -186,7 +194,19 @@ private SmallFileAssign getSmallFileAssign(String partitionPath) { /** * Refresh the table state like TableFileSystemView and HoodieTimeline. */ - public void reload(long checkpointId) { + public synchronized void reload(long checkpointId) { + this.accCkp += 1; + if (this.accCkp > 1) { + // do not clean the new file assignment state for the first checkpoint, + // this #reload calling is triggered by checkpoint success event, the coordinator + // also relies on the checkpoint success event to commit the inflight instant, + // and very possibly this component receives the notification before the coordinator, + // if we do the cleaning, the records processed within the time range: + // (start checkpoint, checkpoint success(and instant committed)) + // would be assigned to a fresh new data bucket which is not the right behavior. 
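// Editor's note (descriptive comment, not part of the patch): because reload() is invoked once
// per checkpoint-success notification, the guard above clears newFileAssignStates only on every
// second notification (accCkp: 0 -> 1 keep -> 2 clear and reset -> 1 keep -> ...), so assignments
// made while a checkpoint was still being confirmed are not wiped immediately.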
+ this.newFileAssignStates.clear(); + this.accCkp = 0; + } this.smallFileAssignMap.clear(); this.writeProfile.reload(checkpointId); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java index 6b5e96eb83263..922c056d259de 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.AbstractTableFileSystemView; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.action.commit.SmallFile; @@ -55,7 +56,7 @@ protected List smallFilesProfile(String partitionPath) { if (!commitTimeline.empty()) { HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); // initialize the filesystem view based on the commit metadata - initFSViewIfNecessary(commitTimeline); + initFileSystemView(); // find smallest file in partition and append to it List allSmallFileSlices = new ArrayList<>(); // If we can index log files, we can add more inserts to log files for fileIds including those under @@ -90,6 +91,10 @@ protected List smallFilesProfile(String partitionPath) { return smallFileLocations; } + protected AbstractTableFileSystemView getFileSystemView() { + return (AbstractTableFileSystemView) this.table.getSliceView(); + } + private long getTotalFileSize(FileSlice fileSlice) { if (!fileSlice.getBaseFile().isPresent()) { return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList())); diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/EmptyWriteProfile.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/EmptyWriteProfile.java index 3cdd798e2e841..e0a6fc1f4a336 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/EmptyWriteProfile.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/EmptyWriteProfile.java @@ -28,11 +28,8 @@ /** * WriteProfile that always return empty small files. * - *

This write profile is used for cases:
- * i). INSERT OVERWRITE and INSERT OVERWRITE TABLE operations,
- * the existing small files are ignored because of the 'OVERWRITE' semantics;
- * ii). INSERT operation when data file merge is disabled.
- *
+ * This write profile is used for INSERT OVERWRITE and INSERT OVERWRITE TABLE operations,
+ * the existing small files are ignored because of the 'OVERWRITE' semantics.
 *
 *

Note: assumes the index can always index log files for Flink write. */ diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java index 4d953c29f7207..1171a54cde92c 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java @@ -25,18 +25,15 @@ import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.table.view.AbstractTableFileSystemView; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.sink.partitioner.BucketAssigner; import org.apache.hudi.table.HoodieFlinkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.commit.SmallFile; -import org.apache.hudi.util.StreamerUtil; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.core.fs.Path; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,7 +42,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -98,12 +94,7 @@ public class WriteProfile { /** * The file system view cache for one checkpoint interval. */ - protected HoodieTableFileSystemView fsView; - - /** - * Hadoop configuration. - */ - private final Configuration hadoopConf; + protected AbstractTableFileSystemView fsView; /** * Metadata cache to reduce IO of metadata files. 
@@ -116,7 +107,6 @@ public WriteProfile(HoodieWriteConfig config, HoodieFlinkEngineContext context) this.smallFilesMap = new HashMap<>(); this.recordsPerBucket = config.getCopyOnWriteInsertSplitSize(); this.table = HoodieFlinkTable.create(config, context); - this.hadoopConf = StreamerUtil.getHadoopConf(); this.metadataCache = new HashMap<>(); // profile the record statistics on construction recordProfile(); @@ -194,7 +184,7 @@ protected List smallFilesProfile(String partitionPath) { if (!commitTimeline.empty()) { // if we have some commits HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); // initialize the filesystem view based on the commit metadata - initFSViewIfNecessary(commitTimeline); + initFileSystemView(); List allFiles = fsView .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList()); @@ -214,22 +204,16 @@ protected List smallFilesProfile(String partitionPath) { } @VisibleForTesting - public void initFSViewIfNecessary(HoodieTimeline commitTimeline) { + public void initFileSystemView() { if (fsView == null) { - cleanMetadataCache(commitTimeline.getInstants()); - List metadataList = commitTimeline.getInstants() - .map(instant -> - this.metadataCache.computeIfAbsent( - instant.getTimestamp(), - k -> WriteProfiles.getCommitMetadataSafely(config.getTableName(), basePath, instant, commitTimeline) - .orElse(null))) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - FileStatus[] commitFiles = WriteProfiles.getWritePathsOfInstants(basePath, hadoopConf, metadataList); - fsView = new HoodieTableFileSystemView(table.getMetaClient(), commitTimeline, commitFiles); + fsView = getFileSystemView(); } } + protected AbstractTableFileSystemView getFileSystemView() { + return (AbstractTableFileSystemView) this.table.getBaseFileOnlyView(); + } + /** * Remove the overdue metadata from the cache * whose instant does not belong to the given instants {@code instants}. 
@@ -261,8 +245,10 @@ public synchronized void reload(long checkpointId) { return; } this.table.getMetaClient().reloadActiveTimeline(); + this.table.getHoodieView().sync(); recordProfile(); this.fsView = null; + cleanMetadataCache(this.table.getMetaClient().getCommitsTimeline().filterCompletedInstants().getInstants()); this.smallFilesMap.clear(); this.reloadedCheckpointId = checkpointId; } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java index 0ab8f12de9cc3..e8aafd830f10f 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java @@ -21,11 +21,13 @@ import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.util.StreamerUtil; import org.apache.flink.core.fs.Path; @@ -37,11 +39,9 @@ import java.io.FileNotFoundException; import java.io.IOException; -import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.stream.Collectors; /** @@ -83,57 +83,67 @@ public static void clean(String path) { } /** - * Returns all the incremental write file path statuses with the given commits metadata. + * Returns all the incremental write file statuses with the given commits metadata. * * @param basePath Table base path * @param hadoopConf The hadoop conf * @param metadataList The commits metadata - * @return the file statuses array + * @param tableType The table type + * @return the file status array */ public static FileStatus[] getWritePathsOfInstants( Path basePath, Configuration hadoopConf, - List metadataList) { + List metadataList, + HoodieTableType tableType) { FileSystem fs = FSUtils.getFs(basePath.toString(), hadoopConf); - return metadataList.stream().map(metadata -> getWritePathsOfInstant(basePath, metadata, fs)) - .flatMap(Collection::stream).toArray(FileStatus[]::new); + Map uniqueIdToFileStatus = new HashMap<>(); + metadataList.forEach(metadata -> + uniqueIdToFileStatus.putAll(getFilesToReadOfInstant(basePath, metadata, fs, tableType))); + return uniqueIdToFileStatus.values().toArray(new FileStatus[0]); } /** - * Returns the commit file paths with given metadata. + * Returns the commit file status info with given metadata. 
* - * @param basePath Table base path - * @param metadata The metadata - * @param fs The filesystem - * @return the commit file status list + * @param basePath Table base path + * @param metadata The metadata + * @param fs The filesystem + * @param tableType The table type + * @return the commit file status info grouping by specific ID */ - private static List getWritePathsOfInstant(Path basePath, HoodieCommitMetadata metadata, FileSystem fs) { - return metadata.getFileIdAndFullPaths(basePath.toString()).values().stream() - .map(org.apache.hadoop.fs.Path::new) + private static Map getFilesToReadOfInstant( + Path basePath, + HoodieCommitMetadata metadata, + FileSystem fs, + HoodieTableType tableType) { + return getFilesToRead(metadata, basePath.toString(), tableType).entrySet().stream() // filter out the file paths that does not exist, some files may be cleaned by // the cleaner. - .filter(path -> { - try { - return fs.exists(path); - } catch (IOException e) { - LOG.error("Checking exists of path: {} error", path); - throw new HoodieException(e); - } - }).map(path -> { + .filter(entry -> { try { - return fs.getFileStatus(path); - } catch (FileNotFoundException fe) { - LOG.warn("File {} was deleted by the cleaner, ignore", path); - return null; + return fs.exists(entry.getValue().getPath()); } catch (IOException e) { - LOG.error("Get write status of path: {} error", path); + LOG.error("Checking exists of path: {} error", entry.getValue().getPath()); throw new HoodieException(e); } }) - // filter out crushed files - .filter(Objects::nonNull) - .filter(StreamerUtil::isValidFile) - .collect(Collectors.toList()); + .filter(entry -> StreamerUtil.isValidFile(entry.getValue())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + private static Map getFilesToRead( + HoodieCommitMetadata metadata, + String basePath, + HoodieTableType tableType) { + switch (tableType) { + case COPY_ON_WRITE: + return metadata.getFileIdToFileStatus(basePath); + case MERGE_ON_READ: + return metadata.getFullPathToFileStatus(basePath); + default: + throw new AssertionError(); + } } /** @@ -178,9 +188,8 @@ public static HoodieCommitMetadata getCommitMetadata( Path basePath, HoodieInstant instant, HoodieTimeline timeline) { - byte[] data = timeline.getInstantDetails(instant).get(); try { - return HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class); + return HoodieInputFormatUtils.getCommitMetadata(instant, timeline); } catch (IOException e) { LOG.error("Get write metadata for table {} with instant {} and path: {} error", tableName, instant.getTimestamp(), basePath); diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/CoordinatorExecutor.java b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/CoordinatorExecutor.java deleted file mode 100644 index 761d03d58c4b7..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/CoordinatorExecutor.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.sink.utils; - -import org.apache.hudi.exception.HoodieException; - -import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; -import org.slf4j.Logger; - -/** - * Coordinator executor that executes the tasks asynchronously, it fails the job - * for any task exceptions. - * - *

We need this because the coordinator methods are called by - * the Job Manager's main thread (mailbox thread), executes the methods asynchronously - * to avoid blocking the main thread. - */ -public class CoordinatorExecutor extends NonThrownExecutor { - private final OperatorCoordinator.Context context; - - public CoordinatorExecutor(OperatorCoordinator.Context context, Logger logger) { - super(logger, true); - this.context = context; - } - - @Override - protected void exceptionHook(String actionString, Throwable t) { - this.context.failJob(new HoodieException(actionString, t)); - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java index d374882e9e6ed..1c051c8cd2300 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java @@ -79,7 +79,6 @@ private static HiveSyncConfig buildSyncConfig(Configuration conf) { hiveSyncConfig.partitionValueExtractorClass = conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME); hiveSyncConfig.useJdbc = conf.getBoolean(FlinkOptions.HIVE_SYNC_USE_JDBC); hiveSyncConfig.useFileListingFromMetadata = conf.getBoolean(FlinkOptions.METADATA_ENABLED); - hiveSyncConfig.verifyMetadataFileListing = false; hiveSyncConfig.ignoreExceptions = conf.getBoolean(FlinkOptions.HIVE_SYNC_IGNORE_EXCEPTIONS); hiveSyncConfig.supportTimestamp = conf.getBoolean(FlinkOptions.HIVE_SYNC_SUPPORT_TIMESTAMP); hiveSyncConfig.autoCreateDatabase = conf.getBoolean(FlinkOptions.HIVE_SYNC_AUTO_CREATE_DB); diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/NonThrownExecutor.java b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/NonThrownExecutor.java index 446cb854cab7a..242b3ee0d8b08 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/NonThrownExecutor.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/NonThrownExecutor.java @@ -18,16 +18,23 @@ package org.apache.hudi.sink.utils; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.function.ThrowingRunnable; import org.slf4j.Logger; +import javax.annotation.Nullable; + +import java.util.Objects; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** * An executor service that catches all the throwable with logging. + * + *

A post-exception hook {@link ExceptionHook} can be defined on construction + * or on each execution. */ public class NonThrownExecutor implements AutoCloseable { private final Logger logger; @@ -37,19 +44,27 @@ public class NonThrownExecutor implements AutoCloseable { */ private final ExecutorService executor; + /** + * Exception hook for post-exception handling. + */ + @VisibleForTesting + protected final ExceptionHook exceptionHook; + /** * Flag saying whether to wait for the tasks finish on #close. */ - private final boolean waitForTaskFinishOnClose; + private final boolean waitForTasksFinish; - public NonThrownExecutor(Logger logger, boolean waitForTaskFinishOnClose) { + @VisibleForTesting + protected NonThrownExecutor(Logger logger, @Nullable ExceptionHook exceptionHook, boolean waitForTasksFinish) { this.executor = Executors.newSingleThreadExecutor(); this.logger = logger; - this.waitForTaskFinishOnClose = waitForTaskFinishOnClose; + this.exceptionHook = exceptionHook; + this.waitForTasksFinish = waitForTasksFinish; } - public NonThrownExecutor(Logger logger) { - this(logger, false); + public static Builder builder(Logger logger) { + return new Builder(logger); } /** @@ -59,6 +74,17 @@ public void execute( final ThrowingRunnable action, final String actionName, final Object... actionParams) { + execute(action, this.exceptionHook, actionName, actionParams); + } + + /** + * Run the action in a loop. + */ + public void execute( + final ThrowingRunnable action, + final ExceptionHook hook, + final String actionName, + final Object... actionParams) { executor.execute( () -> { @@ -73,19 +99,17 @@ public void execute( ExceptionUtils.rethrowIfFatalErrorOrOOM(t); final String errMsg = String.format("Executor executes action [%s] error", actionString); logger.error(errMsg, t); - exceptionHook(errMsg, t); + if (hook != null) { + hook.apply(errMsg, t); + } } }); } - protected void exceptionHook(String errMsg, Throwable t) { - // for sub-class to override. - } - @Override public void close() throws Exception { if (executor != null) { - if (waitForTaskFinishOnClose) { + if (waitForTasksFinish) { executor.shutdown(); } else { executor.shutdownNow(); @@ -95,4 +119,38 @@ public void close() throws Exception { executor.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); } } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + public interface ExceptionHook { + void apply(String errMsg, Throwable t); + } + + /** + * Builder for {@link NonThrownExecutor}. 
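A minimal sketch (not part of this patch) of the builder-based construction that replaces the deleted CoordinatorExecutor subclass; LOG (an slf4j Logger), context (an OperatorCoordinator.Context), instant and the commitInstant helper are illustrative assumptions.

NonThrownExecutor executor = NonThrownExecutor.builder(LOG)
    .exceptionHook((errMsg, t) -> context.failJob(new HoodieException(errMsg, t))) // fail the job on any task error
    .waitForTasksFinish(true) // wait for pending tasks when close() is called
    .build();
// the construction-time hook is used unless a specific execute(...) call supplies its own
executor.execute(() -> commitInstant(instant), "commit instant %s", instant);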
+ */ + public static class Builder { + private final Logger logger; + private ExceptionHook exceptionHook; + private boolean waitForTasksFinish = false; + + private Builder(Logger logger) { + this.logger = Objects.requireNonNull(logger); + } + + public NonThrownExecutor build() { + return new NonThrownExecutor(logger, exceptionHook, waitForTasksFinish); + } + + public Builder exceptionHook(ExceptionHook exceptionHook) { + this.exceptionHook = exceptionHook; + return this; + } + + public Builder waitForTasksFinish(boolean waitForTasksFinish) { + this.waitForTasksFinish = waitForTasksFinish; + return this; + } + } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/PayloadCreation.java b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/PayloadCreation.java index d10447f816d8e..64facf3b16f1b 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/PayloadCreation.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/PayloadCreation.java @@ -26,6 +26,7 @@ import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; import org.apache.avro.generic.GenericRecord; import org.apache.flink.configuration.Configuration; @@ -55,13 +56,14 @@ private PayloadCreation( } public static PayloadCreation instance(Configuration conf) throws Exception { - boolean shouldCombine = conf.getBoolean(FlinkOptions.INSERT_DROP_DUPS) + String preCombineField = OptionsResolver.getPreCombineField(conf); + boolean needCombine = conf.getBoolean(FlinkOptions.PRE_COMBINE) || WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)) == WriteOperationType.UPSERT; - String preCombineField = null; + boolean shouldCombine = needCombine && preCombineField != null; + final Class[] argTypes; final Constructor constructor; if (shouldCombine) { - preCombineField = conf.getString(FlinkOptions.PRECOMBINE_FIELD); argTypes = new Class[] {GenericRecord.class, Comparable.class}; } else { argTypes = new Class[] {Option.class}; diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java index 121118877b9eb..4f803662027c6 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java @@ -83,7 +83,8 @@ public static DataStreamSink bulkInsert(Configuration conf, RowType rowT operatorFactory) // follow the parallelism of upstream operators to avoid shuffle .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)) - .addSink(DummySink.INSTANCE); + .addSink(DummySink.INSTANCE) + .name("dummy"); } public static DataStreamSink append(Configuration conf, RowType rowType, DataStream dataStream) { @@ -93,7 +94,8 @@ public static DataStreamSink append(Configuration conf, RowType rowType, .transform("hoodie_append_write", TypeInformation.of(Object.class), operatorFactory) .uid("uid_hoodie_stream_write" + conf.getString(FlinkOptions.TABLE_NAME)) .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)) - .addSink(DummySink.INSTANCE); + .addSink(DummySink.INSTANCE) + .name("dummy"); } public static DataStream bootstrap( diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/TimeWait.java b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/TimeWait.java index 2ab0819abf10f..0441673c33d12 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/TimeWait.java +++ 
b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/TimeWait.java @@ -20,6 +20,9 @@ import org.apache.hudi.exception.HoodieException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.util.Objects; import java.util.concurrent.TimeUnit; @@ -27,9 +30,12 @@ * Tool used for time waiting. */ public class TimeWait { - private final long timeout; // timeout in SECONDS - private final long interval; // interval in MILLISECONDS - private final String action; // action to report error message + private static final Logger LOG = LoggerFactory.getLogger(TimeWait.class); + + private final long timeout; // timeout in SECONDS + private final long interval; // interval in MILLISECONDS + private final String action; // action to report error message + private long waitingTime = 0L; private TimeWait(long timeout, long interval, String action) { @@ -61,17 +67,17 @@ public void waitFor() { * Builder. */ public static class Builder { - private long timeout; - private long interval; + private long timeout = 5 * 60 * 1000L; // default 5 minutes + private long interval = 1000; private String action; - public Builder() { - this.timeout = 3600; - this.interval = 500; + private Builder() { } public Builder timeout(long timeout) { - this.timeout = timeout; + if (timeout > 0) { + this.timeout = timeout; + } return this; } diff --git a/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java b/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java index be02fc404a6f4..07383ef7fea5f 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java +++ b/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java @@ -22,13 +22,15 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.util.StreamerUtil; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.configuration.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; -import java.io.File; +import javax.annotation.Nullable; + import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -36,6 +38,7 @@ import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Set; /** * A file index which supports listing files efficiently through metadata table. @@ -46,10 +49,12 @@ public class FileIndex { private final Path path; private final HoodieMetadataConfig metadataConfig; private List partitionPaths; // cache of partition paths + private final boolean tableExists; private FileIndex(Path path, Configuration conf) { this.path = path; this.metadataConfig = metadataConfig(conf); + this.tableExists = StreamerUtil.tableExists(path.toString(), StreamerUtil.getHadoopConf()); } public static FileIndex instance(Path path, Configuration conf) { @@ -86,7 +91,7 @@ public List> getPartitions( } List> partitions = new ArrayList<>(); for (String partitionPath : partitionPaths) { - String[] paths = partitionPath.split(File.separator); + String[] paths = partitionPath.split(Path.SEPARATOR); Map partitionMapping = new LinkedHashMap<>(); if (hivePartition) { Arrays.stream(paths).forEach(p -> { @@ -109,6 +114,9 @@ public List> getPartitions( * Returns all the file statuses under the table base path. 
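A rough sketch (not part of this patch) of driving the file index with the new empty-table guard and the partition-pruning setter introduced in this hunk; basePath, conf and requiredPartitionPaths are illustrative assumptions.

FileIndex fileIndex = FileIndex.instance(new org.apache.hadoop.fs.Path(basePath), conf);
fileIndex.setPartitionPaths(requiredPartitionPaths); // null keeps all partitions (no pruning)
FileStatus[] files = fileIndex.getFilesInPartitions(); // empty array while the table does not exist yet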
*/ public FileStatus[] getFilesInPartitions() { + if (!tableExists) { + return new FileStatus[0]; + } String[] partitions = getOrBuildPartitionPaths().stream().map(p -> fullPartitionPath(path, p)).toArray(String[]::new); return FSUtils.getFilesInPartitions(HoodieFlinkEngineContext.DEFAULT, metadataConfig, path.toString(), partitions, "/tmp/") @@ -137,16 +145,35 @@ public void reset() { this.partitionPaths = null; } + // ------------------------------------------------------------------------- + // Getter/Setter + // ------------------------------------------------------------------------- + + /** + * Sets up explicit partition paths for pruning. + */ + public void setPartitionPaths(@Nullable Set partitionPaths) { + if (partitionPaths != null) { + this.partitionPaths = new ArrayList<>(partitionPaths); + } + } + // ------------------------------------------------------------------------- // Utilities // ------------------------------------------------------------------------- - private List getOrBuildPartitionPaths() { + /** + * Returns all the relative partition paths. + * + *

The partition paths are cached once invoked. + */ + public List getOrBuildPartitionPaths() { if (this.partitionPaths != null) { return this.partitionPaths; } - this.partitionPaths = FSUtils.getAllPartitionPaths(HoodieFlinkEngineContext.DEFAULT, - metadataConfig, path.toString()); + this.partitionPaths = this.tableExists + ? FSUtils.getAllPartitionPaths(HoodieFlinkEngineContext.DEFAULT, metadataConfig, path.toString()) + : Collections.emptyList(); return this.partitionPaths; } @@ -154,9 +181,7 @@ private static HoodieMetadataConfig metadataConfig(org.apache.flink.configuratio Properties properties = new Properties(); // set up metadata.enabled=true in table DDL to enable metadata listing - properties.put(HoodieMetadataConfig.ENABLE, conf.getBoolean(FlinkOptions.METADATA_ENABLED)); - properties.put(HoodieMetadataConfig.SYNC_ENABLE, conf.getBoolean(FlinkOptions.METADATA_ENABLED)); - properties.put(HoodieMetadataConfig.VALIDATE_ENABLE, false); + properties.put(HoodieMetadataConfig.ENABLE.key(), conf.getBoolean(FlinkOptions.METADATA_ENABLED)); return HoodieMetadataConfig.newBuilder().fromProperties(properties).build(); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java b/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java new file mode 100644 index 0000000000000..653e182bfad9c --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source; + +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.InstantRange; +import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.sink.partitioner.profile.WriteProfiles; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; +import org.apache.hadoop.fs.FileStatus; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import scala.Serializable; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS; + +/** + * Utilities to generate incremental input splits {@link MergeOnReadInputSplit}. + * The input splits are used for streaming and incremental read. + * + *

How to generate the input splits:
+ * <ol>
+ *   <li>first fetch all the commit metadata for the incremental instants;</li>
+ *   <li>resolve the incremental commit file paths;</li>
+ *   <li>filter the full file paths by required partitions;</li>
+ *   <li>use the file paths from #step 3 as the back-up of the filesystem view.</li>
+ * </ol>
+ */ +public class IncrementalInputSplits implements Serializable { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(IncrementalInputSplits.class); + private final Configuration conf; + private final Path path; + private final long maxCompactionMemoryInBytes; + // for partition pruning + private final Set requiredPartitions; + + private IncrementalInputSplits( + Configuration conf, + Path path, + long maxCompactionMemoryInBytes, + @Nullable Set requiredPartitions) { + this.conf = conf; + this.path = path; + this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes; + this.requiredPartitions = requiredPartitions; + } + + /** + * Returns the builder. + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Returns the incremental input splits. + * + * @param metaClient The meta client + * @param hadoopConf The hadoop configuration + * @return The list of incremental input splits or empty if there are no new instants + */ + public Result inputSplits( + HoodieTableMetaClient metaClient, + org.apache.hadoop.conf.Configuration hadoopConf) { + return inputSplits(metaClient, hadoopConf, null); + } + + /** + * Returns the incremental input splits. + * + * @param metaClient The meta client + * @param hadoopConf The hadoop configuration + * @param issuedInstant The last issued instant, only valid in streaming read + * @return The list of incremental input splits or empty if there are no new instants + */ + public Result inputSplits( + HoodieTableMetaClient metaClient, + org.apache.hadoop.conf.Configuration hadoopConf, + String issuedInstant) { + metaClient.reloadActiveTimeline(); + HoodieTimeline commitTimeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(); + if (commitTimeline.empty()) { + LOG.warn("No splits found for the table under path " + path); + return Result.EMPTY; + } + List instants = filterInstantsWithRange(commitTimeline, issuedInstant); + // get the latest instant that satisfies condition + final HoodieInstant instantToIssue = instants.size() == 0 ? null : instants.get(instants.size() - 1); + final InstantRange instantRange; + if (instantToIssue != null) { + if (issuedInstant != null) { + // the streaming reader may record the last issued instant, if the issued instant is present, + // the instant range should be: (issued instant, the latest instant]. + instantRange = InstantRange.getInstance(issuedInstant, instantToIssue.getTimestamp(), + InstantRange.RangeType.OPEN_CLOSE); + } else if (this.conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent()) { + // first time consume and has a start commit + final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT); + instantRange = startCommit.equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST) + ? null + : InstantRange.getInstance(startCommit, instantToIssue.getTimestamp(), InstantRange.RangeType.CLOSE_CLOSE); + } else { + // first time consume and no start commit, consumes the latest incremental data set. 
+ instantRange = InstantRange.getInstance(instantToIssue.getTimestamp(), instantToIssue.getTimestamp(), + InstantRange.RangeType.CLOSE_CLOSE); + } + } else { + LOG.info("No new instant found for the table under path " + path + ", skip reading"); + return Result.EMPTY; + } + + String tableName = conf.getString(FlinkOptions.TABLE_NAME); + List activeMetadataList = instants.stream() + .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline)).collect(Collectors.toList()); + List archivedMetadataList = getArchivedMetadata(metaClient, instantRange, commitTimeline, tableName); + if (archivedMetadataList.size() > 0) { + LOG.warn("\n" + + "--------------------------------------------------------------------------------\n" + + "---------- caution: the reader has fall behind too much from the writer,\n" + + "---------- tweak 'read.tasks' option to add parallelism of read tasks.\n" + + "--------------------------------------------------------------------------------"); + } + List metadataList = archivedMetadataList.size() > 0 + // IMPORTANT: the merged metadata list must be in ascending order by instant time + ? mergeList(archivedMetadataList, activeMetadataList) + : activeMetadataList; + + Set writePartitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList); + // apply partition push down + if (this.requiredPartitions != null) { + writePartitions = writePartitions.stream() + .filter(this.requiredPartitions::contains).collect(Collectors.toSet()); + } + FileStatus[] fileStatuses = WriteProfiles.getWritePathsOfInstants(path, hadoopConf, metadataList, metaClient.getTableType()); + if (fileStatuses.length == 0) { + LOG.warn("No files found for reading in user provided path."); + return Result.EMPTY; + } + + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, commitTimeline, fileStatuses); + final String endInstant = instantToIssue.getTimestamp(); + final AtomicInteger cnt = new AtomicInteger(0); + final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE); + List inputSplits = writePartitions.stream() + .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, endInstant) + .map(fileSlice -> { + Option> logPaths = Option.ofNullable(fileSlice.getLogFiles() + .sorted(HoodieLogFile.getLogFileComparator()) + .map(logFile -> logFile.getPath().toString()) + .collect(Collectors.toList())); + String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null); + return new MergeOnReadInputSplit(cnt.getAndAdd(1), + basePath, logPaths, endInstant, + metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange); + }).collect(Collectors.toList())) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + return Result.instance(inputSplits, endInstant); + } + + /** + * Returns the archived metadata in case the reader consumes untimely or it wants + * to read from the earliest. + * + *

Note: should improve it with metadata table when the metadata table is stable enough. + * + * @param metaClient The meta client + * @param instantRange The instant range to filter the timeline instants + * @param commitTimeline The commit timeline + * @param tableName The table name + * @return the list of archived metadata, or empty if there is no need to read the archived timeline + */ + private List getArchivedMetadata( + HoodieTableMetaClient metaClient, + InstantRange instantRange, + HoodieTimeline commitTimeline, + String tableName) { + if (instantRange == null || commitTimeline.isBeforeTimelineStarts(instantRange.getStartInstant())) { + // read the archived metadata if: + // 1. the start commit is 'earliest'; + // 2. the start instant is archived. + HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); + HoodieTimeline archivedCompleteTimeline = archivedTimeline.getCommitsTimeline().filterCompletedInstants(); + if (!archivedCompleteTimeline.empty()) { + final String endTs = archivedCompleteTimeline.lastInstant().get().getTimestamp(); + Stream instantStream = archivedCompleteTimeline.getInstants(); + if (instantRange != null) { + archivedTimeline.loadInstantDetailsInMemory(instantRange.getStartInstant(), endTs); + instantStream = instantStream.filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, instantRange.getStartInstant())); + } else { + final String startTs = archivedCompleteTimeline.firstInstant().get().getTimestamp(); + archivedTimeline.loadInstantDetailsInMemory(startTs, endTs); + } + return instantStream + .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, archivedTimeline)).collect(Collectors.toList()); + } + } + return Collections.emptyList(); + } + + /** + * Returns the instants with a given issuedInstant to start from. 
+ * + * @param commitTimeline The completed commits timeline + * @param issuedInstant The last issued instant that has already been delivered to downstream + * @return the filtered hoodie instants + */ + private List filterInstantsWithRange( + HoodieTimeline commitTimeline, + final String issuedInstant) { + HoodieTimeline completedTimeline = commitTimeline.filterCompletedInstants(); + if (issuedInstant != null) { + // returns early for streaming mode + return completedTimeline.getInstants() + .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN, issuedInstant)) + .collect(Collectors.toList()); + } + + Stream instantStream = completedTimeline.getInstants(); + + if (this.conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent() + && !this.conf.get(FlinkOptions.READ_START_COMMIT).equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST)) { + final String startCommit = this.conf.get(FlinkOptions.READ_START_COMMIT); + instantStream = instantStream + .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, startCommit)); + } + if (this.conf.getOptional(FlinkOptions.READ_END_COMMIT).isPresent()) { + final String endCommit = this.conf.get(FlinkOptions.READ_END_COMMIT); + instantStream = instantStream.filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), LESSER_THAN_OR_EQUALS, endCommit)); + } + return instantStream.collect(Collectors.toList()); + } + + private static List mergeList(List list1, List list2) { + List merged = new ArrayList<>(list1); + merged.addAll(list2); + return merged; + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * Represents a result of calling {@link #inputSplits}. + */ + public static class Result { + private final List inputSplits; // input splits + private final String endInstant; // end instant to consume to + + public static final Result EMPTY = instance(Collections.emptyList(), ""); + + public boolean isEmpty() { + return this.inputSplits.size() == 0; + } + + public List getInputSplits() { + return this.inputSplits; + } + + public String getEndInstant() { + return this.endInstant; + } + + private Result(List inputSplits, String endInstant) { + this.inputSplits = inputSplits; + this.endInstant = endInstant; + } + + public static Result instance(List inputSplits, String endInstant) { + return new Result(inputSplits, endInstant); + } + } + + /** + * Builder for {@link IncrementalInputSplits}. 
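The intended call pattern for this class (not part of this patch, but mirroring how StreamReadMonitoringFunction below wires it up); conf, path, maxCompactionMemoryInBytes, requiredPartitionPaths, metaClient, hadoopConf, issuedInstant and the source context are assumed to be in scope.

IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder()
    .conf(conf)
    .path(path)
    .maxCompactionMemoryInBytes(maxCompactionMemoryInBytes)
    .requiredPartitions(requiredPartitionPaths) // null disables partition pruning
    .build();
IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, hadoopConf, issuedInstant);
if (!result.isEmpty()) {
  for (MergeOnReadInputSplit split : result.getInputSplits()) {
    context.collect(split); // forward each split downstream
  }
  issuedInstant = result.getEndInstant(); // resume point for the next monitoring round
}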
+ */ + public static class Builder { + private Configuration conf; + private Path path; + private long maxCompactionMemoryInBytes; + // for partition pruning + private Set requiredPartitions; + + public Builder() { + } + + public Builder conf(Configuration conf) { + this.conf = conf; + return this; + } + + public Builder path(Path path) { + this.path = path; + return this; + } + + public Builder maxCompactionMemoryInBytes(long maxCompactionMemoryInBytes) { + this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes; + return this; + } + + public Builder requiredPartitions(@Nullable Set requiredPartitions) { + this.requiredPartitions = requiredPartitions; + return this; + } + + public IncrementalInputSplits build() { + return new IncrementalInputSplits(Objects.requireNonNull(this.conf), Objects.requireNonNull(this.path), + this.maxCompactionMemoryInBytes, this.requiredPartitions); + } + } +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java b/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java index ec56903412cfa..c7bcc399ebc9d 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java @@ -18,19 +18,9 @@ package org.apache.hudi.source; -import org.apache.hudi.common.model.BaseFile; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.InstantRange; -import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.HoodieTableFileSystemView; -import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.configuration.FlinkOptions; -import org.apache.hudi.sink.partitioner.profile.WriteProfiles; import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; import org.apache.hudi.util.StreamerUtil; @@ -45,24 +35,15 @@ import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.source.RichSourceFunction; import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.hadoop.fs.FileStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; -import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS; /** * This is the single (non-parallel) monitoring task which takes a {@link MergeOnReadInputSplit} @@ -112,21 +93,21 @@ public class StreamReadMonitoringFunction private HoodieTableMetaClient metaClient; - private final long maxCompactionMemoryInBytes; - - // for partition pruning - private final Set requiredPartitionPaths; + private final IncrementalInputSplits incrementalInputSplits; public StreamReadMonitoringFunction( Configuration conf, Path path, long maxCompactionMemoryInBytes, - Set 
requiredPartitionPaths) { + @Nullable Set requiredPartitionPaths) { this.conf = conf; this.path = path; this.interval = conf.getInteger(FlinkOptions.READ_STREAMING_CHECK_INTERVAL); - this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes; - this.requiredPartitionPaths = requiredPartitionPaths; + this.incrementalInputSplits = IncrementalInputSplits.builder() + .conf(conf) + .path(path) + .maxCompactionMemoryInBytes(maxCompactionMemoryInBytes) + .requiredPartitions(requiredPartitionPaths).build(); } @Override @@ -208,86 +189,23 @@ public void monitorDirAndForwardSplits(SourceContext cont // table does not exist return; } - metaClient.reloadActiveTimeline(); - HoodieTimeline commitTimeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(); - if (commitTimeline.empty()) { - LOG.warn("No splits found for the table under path " + path); - return; - } - List instants = filterInstantsWithStart(commitTimeline, this.issuedInstant); - // get the latest instant that satisfies condition - final HoodieInstant instantToIssue = instants.size() == 0 ? null : instants.get(instants.size() - 1); - final InstantRange instantRange; - if (instantToIssue != null) { - if (this.issuedInstant != null) { - // had already consumed an instant - instantRange = InstantRange.getInstance(this.issuedInstant, instantToIssue.getTimestamp(), - InstantRange.RangeType.OPEN_CLOSE); - } else if (this.conf.getOptional(FlinkOptions.READ_STREAMING_START_COMMIT).isPresent()) { - // first time consume and has a start commit - final String specifiedStart = this.conf.getString(FlinkOptions.READ_STREAMING_START_COMMIT); - instantRange = specifiedStart.equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST) - ? null - : InstantRange.getInstance(specifiedStart, instantToIssue.getTimestamp(), InstantRange.RangeType.CLOSE_CLOSE); - } else { - // first time consume and no start commit, consumes the latest incremental data set. - instantRange = InstantRange.getInstance(instantToIssue.getTimestamp(), instantToIssue.getTimestamp(), - InstantRange.RangeType.CLOSE_CLOSE); - } - } else { - LOG.info("No new instant found for the table under path " + path + ", skip reading"); - return; - } - // generate input split: - // 1. first fetch all the commit metadata for the incremental instants; - // 2. filter the relative partition paths - // 3. filter the full file paths - // 4. use the file paths from #step 3 as the back-up of the filesystem view - - String tableName = conf.getString(FlinkOptions.TABLE_NAME); - List activeMetadataList = instants.stream() - .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline)).collect(Collectors.toList()); - List archivedMetadataList = getArchivedMetadata(instantRange, commitTimeline, tableName); - List metadataList = archivedMetadataList.size() > 0 - ? 
mergeList(activeMetadataList, archivedMetadataList) - : activeMetadataList; - - Set writePartitions = getWritePartitionPaths(metadataList); - // apply partition push down - if (this.requiredPartitionPaths.size() > 0) { - writePartitions = writePartitions.stream() - .filter(this.requiredPartitionPaths::contains).collect(Collectors.toSet()); - } - FileStatus[] fileStatuses = WriteProfiles.getWritePathsOfInstants(path, hadoopConf, metadataList); - if (fileStatuses.length == 0) { - LOG.warn("No files found for reading in user provided path."); + IncrementalInputSplits.Result result = + incrementalInputSplits.inputSplits(metaClient, this.hadoopConf, this.issuedInstant); + if (result.isEmpty()) { + // no new instants, returns early return; } - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, commitTimeline, fileStatuses); - final String commitToIssue = instantToIssue.getTimestamp(); - final AtomicInteger cnt = new AtomicInteger(0); - final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE); - List inputSplits = writePartitions.stream() - .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, commitToIssue) - .map(fileSlice -> { - Option> logPaths = Option.ofNullable(fileSlice.getLogFiles() - .sorted(HoodieLogFile.getLogFileComparator()) - .map(logFile -> logFile.getPath().toString()) - .collect(Collectors.toList())); - String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null); - return new MergeOnReadInputSplit(cnt.getAndAdd(1), - basePath, logPaths, commitToIssue, - metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange); - }).collect(Collectors.toList())) - .flatMap(Collection::stream) - .collect(Collectors.toList()); - - for (MergeOnReadInputSplit split : inputSplits) { + for (MergeOnReadInputSplit split : result.getInputSplits()) { context.collect(split); } // update the issues instant time - this.issuedInstant = commitToIssue; + this.issuedInstant = result.getEndInstant(); + LOG.info("\n" + + "------------------------------------------------------------\n" + + "---------- consumed to instant: {}\n" + + "------------------------------------------------------------", + this.issuedInstant); } @Override @@ -331,87 +249,4 @@ public void snapshotState(FunctionSnapshotContext context) throws Exception { this.instantState.add(this.issuedInstant); } } - - /** - * Returns the archived metadata in case the reader consumes untimely or it wants - * to read from the earliest. - * - *

Note: should improve it with metadata table when the metadata table is stable enough. - * - * @param instantRange The instant range to filter the timeline instants - * @param commitTimeline The commit timeline - * @param tableName The table name - * @return the list of archived metadata, or empty if there is no need to read the archived timeline - */ - private List getArchivedMetadata( - InstantRange instantRange, - HoodieTimeline commitTimeline, - String tableName) { - if (instantRange == null || commitTimeline.isBeforeTimelineStarts(instantRange.getStartInstant())) { - // read the archived metadata if: - // 1. the start commit is 'earliest'; - // 2. the start instant is archived. - HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); - HoodieTimeline archivedCompleteTimeline = archivedTimeline.getCommitsTimeline().filterCompletedInstants(); - if (!archivedCompleteTimeline.empty()) { - final String endTs = archivedCompleteTimeline.lastInstant().get().getTimestamp(); - Stream instantStream = archivedCompleteTimeline.getInstants(); - if (instantRange != null) { - archivedTimeline.loadInstantDetailsInMemory(instantRange.getStartInstant(), endTs); - instantStream = instantStream.filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, instantRange.getStartInstant())); - } else { - final String startTs = archivedCompleteTimeline.firstInstant().get().getTimestamp(); - archivedTimeline.loadInstantDetailsInMemory(startTs, endTs); - } - return instantStream - .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, archivedTimeline)).collect(Collectors.toList()); - } - } - return Collections.emptyList(); - } - - /** - * Returns the instants with a given issuedInstant to start from. - * - * @param commitTimeline The completed commits timeline - * @param issuedInstant The last issued instant that has already been delivered to downstream - * @return the filtered hoodie instants - */ - private List filterInstantsWithStart( - HoodieTimeline commitTimeline, - final String issuedInstant) { - HoodieTimeline completedTimeline = commitTimeline.filterCompletedInstants(); - if (issuedInstant != null) { - return completedTimeline.getInstants() - .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN, issuedInstant)) - .collect(Collectors.toList()); - } else if (this.conf.getOptional(FlinkOptions.READ_STREAMING_START_COMMIT).isPresent() - && !this.conf.get(FlinkOptions.READ_STREAMING_START_COMMIT).equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST)) { - String definedStartCommit = this.conf.get(FlinkOptions.READ_STREAMING_START_COMMIT); - return completedTimeline.getInstants() - .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, definedStartCommit)) - .collect(Collectors.toList()); - } else { - return completedTimeline.getInstants().collect(Collectors.toList()); - } - } - - /** - * Returns all the incremental write partition paths as a set with the given commits metadata. 
- * - * @param metadataList The commits metadata - * @return the partition path set - */ - private Set getWritePartitionPaths(List metadataList) { - return metadataList.stream() - .map(HoodieCommitMetadata::getWritePartitionPaths) - .flatMap(Collection::stream) - .collect(Collectors.toSet()); - } - - private static List mergeList(List list1, List list2) { - List merged = new ArrayList<>(list1); - merged.addAll(list2); - return merged; - } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java b/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java index e2f5f7b95137c..013043384d3b5 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java +++ b/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java @@ -64,6 +64,8 @@ public class StreamReadOperator extends AbstractStreamOperator private static final Logger LOG = LoggerFactory.getLogger(StreamReadOperator.class); + private static final int MINI_BATCH_SIZE = 1000; + // It's the same thread that runs this operator and checkpoint actions. Use this executor to schedule only // splits for subsequent reading, so that a new checkpoint could be triggered without blocking a long time // for exhausting all scheduled split reading tasks. @@ -74,6 +76,7 @@ public class StreamReadOperator extends AbstractStreamOperator private transient SourceFunction.SourceContext sourceContext; private transient ListState inputSplitsState; + private transient Queue splits; // Splits are read by the same thread that calls #processElement. Each read task is submitted to that thread by adding @@ -146,31 +149,56 @@ private void enqueueProcessSplits() { } private void processSplits() throws IOException { - MergeOnReadInputSplit split = splits.poll(); + MergeOnReadInputSplit split = splits.peek(); if (split == null) { currentSplitState = SplitState.IDLE; return; } - // This log is important to indicate the consuming process, there is only one log message for one data bucket. - LOG.info("Processing input split : {}", split); - - try { + // 1. open a fresh new input split and start reading as mini-batch + // 2. if the input split has remaining records to read, switches to another runnable to handle + // 3. if the input split reads to the end, close the format and remove the split from the queue #splits + // 4. for each runnable, reads at most #MINI_BATCH_SIZE number of records + if (format.isClosed()) { + // This log is important to indicate the consuming process, + // there is only one log message for one data bucket. + LOG.info("Processing input split : {}", split); format.open(split); - RowData nextElement = null; - while (!format.reachedEnd()) { - nextElement = format.nextRecord(nextElement); - sourceContext.collect(nextElement); - } + } + try { + consumeAsMiniBatch(split); } finally { currentSplitState = SplitState.IDLE; - format.close(); } // Re-schedule to process the next split. enqueueProcessSplits(); } + /** + * Consumes at most {@link #MINI_BATCH_SIZE} number of records + * for the given input split {@code split}. + * + *

Note: close the input format and remove the input split for the queue {@link #splits} + * if the split reads to the end. + * + * @param split The input split + */ + private void consumeAsMiniBatch(MergeOnReadInputSplit split) throws IOException { + for (int i = 0; i < MINI_BATCH_SIZE; i++) { + if (!format.reachedEnd()) { + sourceContext.collect(format.nextRecord(null)); + split.consume(); + } else { + // close the input format + format.close(); + // remove the split + splits.poll(); + break; + } + } + } + @Override public void processWatermark(Watermark mark) { // we do nothing because we emit our own watermarks if needed. diff --git a/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java b/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java index 898ba88fd021e..de2218acb0b94 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java +++ b/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java @@ -35,6 +35,8 @@ import java.util.List; import java.util.Map; +import static org.apache.hudi.configuration.FlinkOptions.PARTITION_FORMAT_DAY; + /** * Configurations for Hoodie Flink streamer. */ @@ -69,8 +71,9 @@ public class FlinkStreamerConfig extends Configuration { @Parameter(names = {"--table-type"}, description = "Type of table. COPY_ON_WRITE (or) MERGE_ON_READ.", required = true) public String tableType; - @Parameter(names = {"--insert-dedup"}, description = "Whether to deduplicate for INSERT operation, if disabled, writes the base files directly.", required = true) - public Boolean insertDedup = true; + @Parameter(names = {"--insert-cluster"}, description = "Whether to merge small files for insert mode, " + + "if true, the write throughput will decrease because the read/write of existing small file, default false.") + public Boolean insertCluster = false; @Parameter(names = {"--props"}, description = "Path to properties file on localfs or dfs, with configurations for " + "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are " @@ -112,7 +115,7 @@ public class FlinkStreamerConfig extends Configuration { @Parameter(names = {"--filter-dupes"}, description = "Should duplicate records from source be dropped/filtered out before insert/bulk-insert.") - public Boolean filterDupes = false; + public Boolean preCombine = false; @Parameter(names = {"--commit-on-errors"}, description = "Commit even when some records failed to be written.") public Boolean commitOnErrors = false; @@ -123,6 +126,30 @@ public class FlinkStreamerConfig extends Configuration { + "writing. Default : Not set. 
Pass a comma-separated list of subclass names to chain the transformations.") public List transformerClassNames = null; + @Parameter(names = {"--metadata-enabled"}, description = "Enable the internal metadata table which serves table metadata like level file listings, default false.") + public Boolean metadataEnabled = false; + + @Parameter(names = {"--metadata-compaction-delta_commits"}, description = "Max delta commits for metadata table to trigger compaction, default 10.") + public Integer metadataCompactionDeltaCommits = 10; + + @Parameter(names = {"--write-partition-format"}, description = "Partition path format, default is 'yyyyMMdd'.") + public String writePartitionFormat = PARTITION_FORMAT_DAY; + + @Parameter(names = {"--write-rate-limit"}, description = "Write record rate limit per second to prevent traffic jitter and improve stability, default 0 (no limit).") + public Long writeRateLimit = 0L; + + @Parameter(names = {"--write-parquet-block-size"}, description = "Parquet RowGroup size. It's recommended to make this large enough that scan costs can be" + + " amortized by packing enough column values into a single row group.") + public Integer writeParquetBlockSize = 120; + + @Parameter(names = {"--write-parquet-max-file-size"}, description = "Target size for parquet files produced by Hudi write phases. " + + "For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.") + public Integer writeParquetMaxFileSize = 120; + + @Parameter(names = {"--parquet-page-size"}, description = "Parquet page size. Page is the unit of read within a parquet file. " + + "Within a block, pages are compressed separately.") + public Integer parquetPageSize = 1; + /** * Flink checkpoint interval. */ @@ -143,18 +170,18 @@ public class FlinkStreamerConfig extends Configuration { @Parameter(names = {"--partition-default-name"}, description = "The default partition name in case the dynamic partition column value is null/empty string") - public String partitionDefaultName = "__DEFAULT_PARTITION__"; + public String partitionDefaultName = "default"; @Parameter(names = {"--index-bootstrap-enabled"}, description = "Whether to bootstrap the index state from existing hoodie table, default false") public Boolean indexBootstrapEnabled = false; - @Parameter(names = {"--index-state-ttl"}, description = "Index state ttl in days, default 1.5 day") - public Double indexStateTtl = 1.5D; + @Parameter(names = {"--index-state-ttl"}, description = "Index state ttl in days, default stores the index permanently") + public Double indexStateTtl = 0D; @Parameter(names = {"--index-global-enabled"}, description = "Whether to update index for the old partition path " - + "if same key record with different partition path came in, default false") - public Boolean indexGlobalEnabled = false; + + "if same key record with different partition path came in, default true") + public Boolean indexGlobalEnabled = true; @Parameter(names = {"--index-partition-regex"}, description = "Whether to load partitions in state if partition path matching, default *") @@ -184,8 +211,8 @@ public class FlinkStreamerConfig extends Configuration { public Double writeTaskMaxSize = 1024D; @Parameter(names = {"--write-batch-size"}, - description = "Batch buffer size in MB to flush data into the underneath filesystem, default 64MB") - public Double writeBatchSize = 64D; + description = "Batch buffer size in MB to flush data into the underneath filesystem, default 256MB") + public Double writeBatchSize = 256D; @Parameter(names 
= {"--write-log-block-size"}, description = "Max log block size in MB for log file, default 128MB") public Integer writeLogBlockSize = 128; @@ -220,8 +247,8 @@ public class FlinkStreamerConfig extends Configuration { @Parameter(names = {"--compaction-max-memory"}, description = "Max memory in MB for compaction spillable map, default 100MB") public Integer compactionMaxMemory = 100; - @Parameter(names = {"--compaction-target-io"}, description = "Target IO per compaction (both read and write), default 5 GB") - public Long compactionTargetIo = 5120L; + @Parameter(names = {"--compaction-target-io"}, description = "Target IO per compaction (both read and write), default 500 GB") + public Long compactionTargetIo = 512000L; @Parameter(names = {"--clean-async-enabled"}, description = "Whether to cleanup the old commits immediately on new commits, enabled by default") public Boolean cleanAsyncEnabled = true; @@ -308,16 +335,23 @@ public static org.apache.flink.configuration.Configuration toFlinkConfig(FlinkSt conf.setString(FlinkOptions.TABLE_NAME, config.targetTableName); // copy_on_write works same as COPY_ON_WRITE conf.setString(FlinkOptions.TABLE_TYPE, config.tableType.toUpperCase()); - conf.setBoolean(FlinkOptions.INSERT_DEDUP, config.insertDedup); + conf.setBoolean(FlinkOptions.INSERT_CLUSTER, config.insertCluster); conf.setString(FlinkOptions.OPERATION, config.operation.value()); conf.setString(FlinkOptions.PRECOMBINE_FIELD, config.sourceOrderingField); conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, config.payloadClassName); - conf.setBoolean(FlinkOptions.INSERT_DROP_DUPS, config.filterDupes); + conf.setBoolean(FlinkOptions.PRE_COMBINE, config.preCombine); conf.setInteger(FlinkOptions.RETRY_TIMES, Integer.parseInt(config.instantRetryTimes)); conf.setLong(FlinkOptions.RETRY_INTERVAL_MS, Long.parseLong(config.instantRetryInterval)); conf.setBoolean(FlinkOptions.IGNORE_FAILED, config.commitOnErrors); conf.setString(FlinkOptions.RECORD_KEY_FIELD, config.recordKeyField); conf.setString(FlinkOptions.PARTITION_PATH_FIELD, config.partitionPathField); + conf.setBoolean(FlinkOptions.METADATA_ENABLED, config.metadataEnabled); + conf.setInteger(FlinkOptions.METADATA_COMPACTION_DELTA_COMMITS, config.metadataCompactionDeltaCommits); + conf.setString(FlinkOptions.PARTITION_FORMAT, config.writePartitionFormat); + conf.setLong(FlinkOptions.WRITE_RATE_LIMIT, config.writeRateLimit); + conf.setInteger(FlinkOptions.WRITE_PARQUET_BLOCK_SIZE, config.writeParquetBlockSize); + conf.setInteger(FlinkOptions.WRITE_PARQUET_MAX_FILE_SIZE, config.writeParquetMaxFileSize); + conf.setInteger(FlinkOptions.WRITE_PARQUET_PAGE_SIZE, config.parquetPageSize); if (!StringUtils.isNullOrEmpty(config.keygenClass)) { conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, config.keygenClass); } else { diff --git a/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java b/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java index 077633ee90e53..bb545ad896ac9 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java +++ b/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java @@ -41,8 +41,9 @@ import java.util.Properties; /** - * An Utility which can incrementally consume data from Kafka and apply it to the target table. - * currently, it only supports COW table and insert, upsert operation. + * A utility which can incrementally consume data from Kafka and apply it to the target table. 
+ * It has the similar functionality with SQL data source except that the source is bind to Kafka + * and the format is bind to JSON. */ public class HoodieFlinkStreamer { public static void main(String[] args) throws Exception { diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java index cf1cbd58f8adc..5299551fccd38 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java @@ -18,7 +18,10 @@ package org.apache.hudi.table; +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; +import org.apache.hudi.common.model.EventTimeAvroPayload; import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; import org.apache.hudi.exception.HoodieValidationException; import org.apache.hudi.hive.MultiPartKeysValueExtractor; import org.apache.hudi.keygen.ComplexAvroKeyGenerator; @@ -26,7 +29,6 @@ import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; import org.apache.hudi.util.AvroSchemaConverter; import org.apache.hudi.util.DataTypeUtils; -import org.apache.hudi.util.StreamerUtil; import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.Configuration; @@ -38,7 +40,6 @@ import org.apache.flink.table.connector.source.DynamicTableSource; import org.apache.flink.table.factories.DynamicTableSinkFactory; import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.flink.table.factories.FactoryUtil; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.hadoop.fs.Path; @@ -60,10 +61,7 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab @Override public DynamicTableSource createDynamicTableSource(Context context) { - FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); - helper.validate(); - - Configuration conf = (Configuration) helper.getOptions(); + Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions()); ResolvedSchema schema = context.getCatalogTable().getResolvedSchema(); sanityCheck(conf, schema); setupConfOptions(conf, context.getObjectIdentifier().getObjectName(), context.getCatalogTable(), schema); @@ -117,25 +115,46 @@ private void sanityCheck(Configuration conf, ResolvedSchema schema) { // validate record key in pk absence. if (!schema.getPrimaryKey().isPresent()) { - Arrays.stream(conf.get(FlinkOptions.RECORD_KEY_FIELD).split(",")) + String[] recordKeys = conf.get(FlinkOptions.RECORD_KEY_FIELD).split(","); + if (recordKeys.length == 1 + && FlinkOptions.RECORD_KEY_FIELD.defaultValue().equals(recordKeys[0]) + && !fields.contains(recordKeys[0])) { + throw new HoodieValidationException("Primary key definition is required, use either PRIMARY KEY syntax " + + "or option '" + FlinkOptions.RECORD_KEY_FIELD.key() + "' to specify."); + } + + Arrays.stream(recordKeys) .filter(field -> !fields.contains(field)) .findAny() .ifPresent(f -> { - throw new ValidationException("Field '" + f + "' does not exist in the table schema." 
- + "Please define primary key or modify 'hoodie.datasource.write.recordkey.field' option."); + throw new HoodieValidationException("Field '" + f + "' specified in option " + + "'" + FlinkOptions.RECORD_KEY_FIELD.key() + "' does not exist in the table schema."); }); } // validate pre_combine key String preCombineField = conf.get(FlinkOptions.PRECOMBINE_FIELD); if (!fields.contains(preCombineField)) { - throw new ValidationException("Field " + preCombineField + " does not exist in the table schema." - + "Please check 'write.precombine.field' option."); + if (OptionsResolver.isDefaultHoodieRecordPayloadClazz(conf)) { + throw new HoodieValidationException("Option '" + FlinkOptions.PRECOMBINE_FIELD.key() + + "' is required for payload class: " + DefaultHoodieRecordPayload.class.getName()); + } + if (preCombineField.equals(FlinkOptions.PRECOMBINE_FIELD.defaultValue())) { + conf.setString(FlinkOptions.PRECOMBINE_FIELD, FlinkOptions.NO_PRE_COMBINE); + } else { + throw new HoodieValidationException("Field " + preCombineField + " does not exist in the table schema." + + "Please check '" + FlinkOptions.PRECOMBINE_FIELD.key() + "' option."); + } + } else if (FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.PAYLOAD_CLASS_NAME)) { + // if precombine field is specified but payload clazz is default, + // use DefaultHoodieRecordPayload to make sure the precombine field is always taken for + // comparing. + conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, EventTimeAvroPayload.class.getName()); } } /** - * Setup the config options based on the table definition, for e.g the table name, primary key. + * Sets up the config options based on the table definition, for e.g the table name, primary key. * * @param conf The configuration to setup * @param tableName The table name @@ -155,6 +174,10 @@ private static void setupConfOptions( setupCompactionOptions(conf); // hive options setupHiveOptions(conf); + // read options + setupReadOptions(conf); + // write options + setupWriteOptions(conf); // infer avro schema from physical DDL schema inferAvroSchema(conf, schema.toPhysicalRowDataType().notNull().getLogicalType()); } @@ -247,17 +270,6 @@ private static void setupCompactionOptions(Configuration conf) { conf.setInteger(FlinkOptions.ARCHIVE_MIN_COMMITS, commitsToRetain + 10); conf.setInteger(FlinkOptions.ARCHIVE_MAX_COMMITS, commitsToRetain + 20); } - if (conf.getBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED) - && !conf.getBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED) - && FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.COMPACTION_TARGET_IO)) { - // if compaction schedule is on, tweak the target io to 500GB - conf.setLong(FlinkOptions.COMPACTION_TARGET_IO, 500 * 1024L); - } - if (StreamerUtil.allowDuplicateInserts(conf)) { - // no need for compaction if insert duplicates is allowed - conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false); - conf.setBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED, false); - } } /** @@ -270,6 +282,26 @@ private static void setupHiveOptions(Configuration conf) { } } + /** + * Sets up the read options from the table definition. + */ + private static void setupReadOptions(Configuration conf) { + if (!conf.getBoolean(FlinkOptions.READ_AS_STREAMING) + && (conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent() || conf.getOptional(FlinkOptions.READ_END_COMMIT).isPresent())) { + conf.setString(FlinkOptions.QUERY_TYPE, FlinkOptions.QUERY_TYPE_INCREMENTAL); + } + } + + /** + * Sets up the write options from the table definition. 
+ */ + private static void setupWriteOptions(Configuration conf) { + if (FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.OPERATION) + && OptionsResolver.isCowTable(conf)) { + conf.setBoolean(FlinkOptions.PRE_COMBINE, true); + } + } + /** * Inferences the deserialization Avro schema from the table schema (e.g. the DDL) * if both options {@link FlinkOptions#SOURCE_AVRO_SCHEMA_PATH} and diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java index 2fdd0fd682d88..e5f097c010072 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; import org.apache.hudi.sink.utils.Pipelines; import org.apache.hudi.util.ChangelogModes; import org.apache.hudi.util.StreamerUtil; @@ -72,18 +73,18 @@ public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { // bulk_insert mode final String writeOperation = this.conf.get(FlinkOptions.OPERATION); if (WriteOperationType.fromValue(writeOperation) == WriteOperationType.BULK_INSERT) { - return Pipelines.bulkInsert(conf, rowType, dataStream); + return context.isBounded() ? Pipelines.bulkInsert(conf, rowType, dataStream) : Pipelines.append(conf, rowType, dataStream); } - // default parallelism - int parallelism = dataStream.getExecutionConfig().getParallelism(); - - DataStream pipeline; // Append mode - if (StreamerUtil.allowDuplicateInserts(conf)) { + if (OptionsResolver.isAppendMode(conf)) { return Pipelines.append(conf, rowType, dataStream); } + // default parallelism + int parallelism = dataStream.getExecutionConfig().getParallelism(); + DataStream pipeline; + // bootstrap final DataStream hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, parallelism, dataStream, context.isBounded()); // write pipeline diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java index 43743fc64319c..f0dbffd4732fe 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java @@ -19,7 +19,6 @@ package org.apache.hudi.table; import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieTableType; @@ -31,6 +30,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hadoop.HoodieROTablePathFilter; import org.apache.hudi.source.FileIndex; +import org.apache.hudi.source.IncrementalInputSplits; import org.apache.hudi.source.StreamReadMonitoringFunction; import org.apache.hudi.source.StreamReadOperator; import org.apache.hudi.table.format.FilePathUtils; @@ -40,6 +40,7 @@ import org.apache.hudi.table.format.mor.MergeOnReadTableState; import org.apache.hudi.util.AvroSchemaConverter; import org.apache.hudi.util.ChangelogModes; +import org.apache.hudi.util.InputFormats; import org.apache.hudi.util.StreamerUtil; import org.apache.avro.Schema; @@ -48,7 +49,6 @@ import org.apache.flink.api.common.io.FilePathFilter; import org.apache.flink.api.common.io.InputFormat; import 
org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.io.CollectionInputFormat; import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; @@ -180,17 +180,15 @@ public DataStream produceDataStream(StreamExecutionEnvironment execEnv) conf, FilePathUtils.toFlinkPath(path), maxCompactionMemoryInBytes, getRequiredPartitionPaths()); InputFormat inputFormat = getInputFormat(true); OneInputStreamOperatorFactory factory = StreamReadOperator.factory((MergeOnReadInputFormat) inputFormat); - SingleOutputStreamOperator source = execEnv.addSource(monitoringFunction, "streaming_source") - .uid("uid_streaming_source_" + conf.getString(FlinkOptions.TABLE_NAME)) + SingleOutputStreamOperator source = execEnv.addSource(monitoringFunction, getSourceOperatorName("split_monitor")) .setParallelism(1) .transform("split_reader", typeInfo, factory) - .uid("uid_split_reader_" + conf.getString(FlinkOptions.TABLE_NAME)) .setParallelism(conf.getInteger(FlinkOptions.READ_TASKS)); return new DataStreamSource<>(source); } else { InputFormatSourceFunction func = new InputFormatSourceFunction<>(getInputFormat(), typeInfo); DataStreamSource source = execEnv.addSource(func, asSummaryString(), typeInfo); - return source.name("bounded_source").setParallelism(conf.getInteger(FlinkOptions.READ_TASKS)); + return source.name(getSourceOperatorName("bounded_source")).setParallelism(conf.getInteger(FlinkOptions.READ_TASKS)); } } }; @@ -220,7 +218,7 @@ public String asSummaryString() { public Result applyFilters(List filters) { this.filters = new ArrayList<>(filters); // refuse all the filters now - return Result.of(Collections.emptyList(), new ArrayList<>(filters)); + return SupportsFilterPushDown.Result.of(Collections.emptyList(), new ArrayList<>(filters)); } @Override @@ -256,8 +254,8 @@ private DataType getProducedDataType() { DataType[] schemaTypes = this.schema.getColumnDataTypes().toArray(new DataType[0]); return DataTypes.ROW(Arrays.stream(this.requiredPos) - .mapToObj(i -> DataTypes.FIELD(schemaFieldNames[i], schemaTypes[i])) - .toArray(DataTypes.Field[]::new)) + .mapToObj(i -> DataTypes.FIELD(schemaFieldNames[i], schemaTypes[i])) + .toArray(DataTypes.Field[]::new)) .bridgedTo(RowData.class); } @@ -268,16 +266,36 @@ private List> getOrFetchPartitions() { return requiredPartitions; } + private String getSourceOperatorName(String operatorName) { + String[] schemaFieldNames = this.schema.getColumnNames().toArray(new String[0]); + List fields = Arrays.stream(this.requiredPos) + .mapToObj(i -> schemaFieldNames[i]) + .collect(Collectors.toList()); + StringBuilder sb = new StringBuilder(); + sb.append(operatorName) + .append("(") + .append("table=").append(Collections.singletonList(conf.getString(FlinkOptions.TABLE_NAME))) + .append(", ") + .append("fields=").append(fields) + .append(")"); + return sb.toString(); + } + + @Nullable private Set getRequiredPartitionPaths() { if (this.requiredPartitions == null) { - return Collections.emptySet(); + // returns null for non partition pruning + return null; } return FilePathUtils.toRelativePartitionPaths(this.partitionKeys, this.requiredPartitions, conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING)); } - private List buildFileIndex(Path[] paths) { - if (paths.length == 0) { + private List buildFileIndex() { + Set requiredPartitionPaths = getRequiredPartitionPaths(); + fileIndex.setPartitionPaths(requiredPartitionPaths); + List 
relPartitionPaths = fileIndex.getOrBuildPartitionPaths(); + if (relPartitionPaths.size() == 0) { return Collections.emptyList(); } FileStatus[] fileStatuses = fileIndex.getFilesInPartitions(); @@ -292,19 +310,17 @@ private List buildFileIndex(Path[] paths) { final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE); final AtomicInteger cnt = new AtomicInteger(0); // generates one input split for each file group - return Arrays.stream(paths).map(partitionPath -> { - String relPartitionPath = FSUtils.getRelativePartitionPath(path, partitionPath); - return fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, latestCommit) - .map(fileSlice -> { - String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null); - Option> logPaths = Option.ofNullable(fileSlice.getLogFiles() - .sorted(HoodieLogFile.getLogFileComparator()) - .map(logFile -> logFile.getPath().toString()) - .collect(Collectors.toList())); - return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, latestCommit, - metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, null); - }).collect(Collectors.toList()); - }) + return relPartitionPaths.stream() + .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, latestCommit) + .map(fileSlice -> { + String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null); + Option> logPaths = Option.ofNullable(fileSlice.getLogFiles() + .sorted(HoodieLogFile.getLogFileComparator()) + .map(logFile -> logFile.getPath().toString()) + .collect(Collectors.toList())); + return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, latestCommit, + metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, null); + }).collect(Collectors.toList())) .flatMap(Collection::stream) .collect(Collectors.toList()); } @@ -319,16 +335,6 @@ private List buildFileIndex(Path[] paths) { } private InputFormat getBatchInputFormat() { - // When this table has no partition, just return an empty source. - if (!partitionKeys.isEmpty() && getOrFetchPartitions().isEmpty()) { - return new CollectionInputFormat<>(Collections.emptyList(), null); - } - - final Path[] paths = getReadPaths(); - if (paths.length == 0) { - return new CollectionInputFormat<>(Collections.emptyList(), null); - } - final Schema tableAvroSchema = getTableAvroSchema(); final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema); final RowType rowType = (RowType) rowDataType.getLogicalType(); @@ -340,62 +346,37 @@ private List buildFileIndex(Path[] paths) { final HoodieTableType tableType = HoodieTableType.valueOf(this.conf.getString(FlinkOptions.TABLE_TYPE)); switch (tableType) { case MERGE_ON_READ: - final List inputSplits = buildFileIndex(paths); + final List inputSplits = buildFileIndex(); if (inputSplits.size() == 0) { // When there is no input splits, just return an empty source. 
LOG.warn("No input splits generate for MERGE_ON_READ input format, returns empty collection instead"); - return new CollectionInputFormat<>(Collections.emptyList(), null); + return InputFormats.EMPTY_INPUT_FORMAT; } - final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState( - rowType, - requiredRowType, - tableAvroSchema.toString(), - AvroSchemaConverter.convertToSchema(requiredRowType).toString(), - inputSplits, - conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",")); - return MergeOnReadInputFormat.builder() - .config(this.conf) - .paths(FilePathUtils.toFlinkPaths(paths)) - .tableState(hoodieTableState) - // use the explicit fields data type because the AvroSchemaConverter - // is not very stable. - .fieldTypes(rowDataType.getChildren()) - .defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME)) - .limit(this.limit) - .emitDelete(false) - .build(); + return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, + rowDataType, inputSplits, false); case COPY_ON_WRITE: - FileInputFormat format = new CopyOnWriteInputFormat( - FilePathUtils.toFlinkPaths(paths), - this.schema.getColumnNames().toArray(new String[0]), - this.schema.getColumnDataTypes().toArray(new DataType[0]), - this.requiredPos, - this.conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME), - this.limit == NO_LIMIT_CONSTANT ? Long.MAX_VALUE : this.limit, // ParquetInputFormat always uses the limit value - getParquetConf(this.conf, this.hadoopConf), - this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE) - ); - format.setFilesFilter(new LatestFileFilter(this.hadoopConf)); - return format; + return baseFileOnlyInputFormat(); default: throw new HoodieException("Unexpected table type: " + this.conf.getString(FlinkOptions.TABLE_TYPE)); } case FlinkOptions.QUERY_TYPE_READ_OPTIMIZED: - FileInputFormat format = new CopyOnWriteInputFormat( - FilePathUtils.toFlinkPaths(paths), - this.schema.getColumnNames().toArray(new String[0]), - this.schema.getColumnDataTypes().toArray(new DataType[0]), - this.requiredPos, - "default", - this.limit == NO_LIMIT_CONSTANT ? Long.MAX_VALUE : this.limit, // ParquetInputFormat always uses the limit value - getParquetConf(this.conf, this.hadoopConf), - this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE) - ); - format.setFilesFilter(new LatestFileFilter(this.hadoopConf)); - return format; + return baseFileOnlyInputFormat(); + case FlinkOptions.QUERY_TYPE_INCREMENTAL: + IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder() + .conf(conf).path(FilePathUtils.toFlinkPath(path)) + .maxCompactionMemoryInBytes(maxCompactionMemoryInBytes) + .requiredPartitions(getRequiredPartitionPaths()).build(); + final IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, hadoopConf); + if (result.isEmpty()) { + // When there is no input splits, just return an empty source. 
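After this change the batch read path dispatches on three query types and every split-based branch degrades to a shared empty source when no input splits are generated. The sketch below restates that dispatch; QueryType and the returned description strings are illustrative stand-ins for the real input formats, not Hudi API.

    // Compact restatement of getBatchInputFormat()'s branching after the patch.
    final class BatchReadDispatchSketch {
      enum QueryType { SNAPSHOT, READ_OPTIMIZED, INCREMENTAL }

      static String chooseBatchFormat(QueryType queryType, boolean mergeOnRead, int splitCount) {
        String splitBased = splitCount == 0
            ? "InputFormats.EMPTY_INPUT_FORMAT"             // nothing to read, but the job stays valid
            : "merge-on-read format over " + splitCount + " splits";
        switch (queryType) {
          case SNAPSHOT:
            return mergeOnRead ? splitBased : "base-file-only format"; // COW snapshot reads base files only
          case READ_OPTIMIZED:
            return "base-file-only format";                            // skip the log files entirely
          case INCREMENTAL:
            return splitBased;                                         // splits come from IncrementalInputSplits
          default:
            throw new IllegalArgumentException("Invalid query type: " + queryType);
        }
      }
    }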
+ LOG.warn("No input splits generate for incremental read, returns empty collection instead"); + return InputFormats.EMPTY_INPUT_FORMAT; + } + return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, + rowDataType, result.getInputSplits(), false); default: - String errMsg = String.format("Invalid query type : '%s', options ['%s', '%s'] are supported now", queryType, - FlinkOptions.QUERY_TYPE_SNAPSHOT, FlinkOptions.QUERY_TYPE_READ_OPTIMIZED); + String errMsg = String.format("Invalid query type : '%s', options ['%s', '%s', '%s'] are supported now", queryType, + FlinkOptions.QUERY_TYPE_SNAPSHOT, FlinkOptions.QUERY_TYPE_READ_OPTIMIZED, FlinkOptions.QUERY_TYPE_INCREMENTAL); throw new HoodieException(errMsg); } } @@ -408,56 +389,62 @@ private List buildFileIndex(Path[] paths) { final RowType requiredRowType = (RowType) getProducedDataType().notNull().getLogicalType(); final String queryType = this.conf.getString(FlinkOptions.QUERY_TYPE); - org.apache.flink.core.fs.Path[] paths = new org.apache.flink.core.fs.Path[0]; if (FlinkOptions.QUERY_TYPE_SNAPSHOT.equals(queryType)) { final HoodieTableType tableType = HoodieTableType.valueOf(this.conf.getString(FlinkOptions.TABLE_TYPE)); - switch (tableType) { - case MERGE_ON_READ: - final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState( - rowType, - requiredRowType, - tableAvroSchema.toString(), - AvroSchemaConverter.convertToSchema(requiredRowType).toString(), - Collections.emptyList(), - conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",")); - return MergeOnReadInputFormat.builder() - .config(this.conf) - .paths(paths) - .tableState(hoodieTableState) - // use the explicit fields data type because the AvroSchemaConverter - // is not very stable. - .fieldTypes(rowDataType.getChildren()) - .defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME)) - .limit(this.limit) - .emitDelete(true) - .build(); - case COPY_ON_WRITE: - final MergeOnReadTableState hoodieTableState2 = new MergeOnReadTableState( - rowType, - requiredRowType, - tableAvroSchema.toString(), - AvroSchemaConverter.convertToSchema(requiredRowType).toString(), - Collections.emptyList(), - conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",")); - return MergeOnReadInputFormat.builder() - .config(this.conf) - .paths(paths) - .tableState(hoodieTableState2) - // use the explicit fields data type because the AvroSchemaConverter - // is not very stable. 
- .fieldTypes(rowDataType.getChildren()) - .defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME)) - .limit(this.limit) - .build(); - default: - throw new HoodieException("Unexpected table type: " + this.conf.getString(FlinkOptions.TABLE_TYPE)); - } + boolean emitDelete = tableType == HoodieTableType.MERGE_ON_READ; + return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, + rowDataType, Collections.emptyList(), emitDelete); } String errMsg = String.format("Invalid query type : '%s', options ['%s'] are supported now", queryType, FlinkOptions.QUERY_TYPE_SNAPSHOT); throw new HoodieException(errMsg); } + private MergeOnReadInputFormat mergeOnReadInputFormat( + RowType rowType, + RowType requiredRowType, + Schema tableAvroSchema, + DataType rowDataType, + List inputSplits, + boolean emitDelete) { + final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState( + rowType, + requiredRowType, + tableAvroSchema.toString(), + AvroSchemaConverter.convertToSchema(requiredRowType).toString(), + inputSplits, + conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",")); + return MergeOnReadInputFormat.builder() + .config(this.conf) + .tableState(hoodieTableState) + // use the explicit fields' data type because the AvroSchemaConverter + // is not very stable. + .fieldTypes(rowDataType.getChildren()) + .defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME)) + .limit(this.limit) + .emitDelete(emitDelete) + .build(); + } + + private InputFormat baseFileOnlyInputFormat() { + final Path[] paths = getReadPaths(); + if (paths.length == 0) { + return InputFormats.EMPTY_INPUT_FORMAT; + } + FileInputFormat format = new CopyOnWriteInputFormat( + FilePathUtils.toFlinkPaths(paths), + this.schema.getColumnNames().toArray(new String[0]), + this.schema.getColumnDataTypes().toArray(new DataType[0]), + this.requiredPos, + this.conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME), + this.limit == NO_LIMIT_CONSTANT ? Long.MAX_VALUE : this.limit, // ParquetInputFormat always uses the limit value + getParquetConf(this.conf, this.hadoopConf), + this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE) + ); + format.setFilesFilter(new LatestFileFilter(this.hadoopConf)); + return format; + } + private Schema inferSchemaFromDdl() { Schema schema = AvroSchemaConverter.convertToSchema(this.schema.toPhysicalRowDataType().getLogicalType()); return HoodieAvroUtils.addMetadataFields(schema, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)); diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java index 1eb7e2db366f1..523062590ea9e 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java @@ -28,7 +28,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -83,7 +82,7 @@ private static boolean needsEscaping(char c) { * @param partitionKVs The partition key value mapping * @param hivePartition Whether the partition path is with Hive style, * e.g. 
{partition key} = {partition value} - * @param sepSuffix Whether to append the file separator as suffix + * @param sepSuffix Whether to append the path separator as suffix * @return an escaped, valid partition name */ public static String generatePartitionPath( @@ -97,7 +96,7 @@ public static String generatePartitionPath( int i = 0; for (Map.Entry e : partitionKVs.entrySet()) { if (i > 0) { - suffixBuf.append(File.separator); + suffixBuf.append(Path.SEPARATOR); } if (hivePartition) { suffixBuf.append(escapePathName(e.getKey())); @@ -107,7 +106,7 @@ public static String generatePartitionPath( i++; } if (sepSuffix) { - suffixBuf.append(File.separator); + suffixBuf.append(Path.SEPARATOR); } return suffixBuf.toString(); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java index 94fbe028b795e..2c3318362b053 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java @@ -110,10 +110,14 @@ public static GenericRecord buildAvroRecordBySchema( List requiredFields = requiredSchema.getFields(); assert (requiredFields.size() == requiredPos.length); Iterator positionIterator = Arrays.stream(requiredPos).iterator(); - requiredFields.forEach(f -> recordBuilder.set(f, record.get(positionIterator.next()))); + requiredFields.forEach(f -> recordBuilder.set(f, getVal(record, positionIterator.next()))); return recordBuilder.build(); } + private static Object getVal(IndexedRecord record, int pos) { + return pos == -1 ? null : record.get(pos); + } + public static HoodieMergedLogRecordScanner logScanner( MergeOnReadInputSplit split, Schema logSchema, diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetColumnarRowSplitReader.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetColumnarRowSplitReader.java index 415469695b09d..64eb1f4853d74 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetColumnarRowSplitReader.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetColumnarRowSplitReader.java @@ -46,6 +46,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.stream.IntStream; import static org.apache.hudi.table.format.cow.ParquetSplitReaderUtil.createColumnReader; import static org.apache.hudi.table.format.cow.ParquetSplitReaderUtil.createWritableColumnVector; @@ -67,6 +68,8 @@ public class ParquetColumnarRowSplitReader implements Closeable { private final MessageType fileSchema; + private final LogicalType[] requestedTypes; + private final MessageType requestedSchema; /** @@ -81,8 +84,6 @@ public class ParquetColumnarRowSplitReader implements Closeable { private final ColumnarRowData row; - private final LogicalType[] selectedTypes; - private final int batchSize; private ParquetFileReader reader; @@ -121,7 +122,6 @@ public ParquetColumnarRowSplitReader( long splitStart, long splitLength) throws IOException { this.utcTimestamp = utcTimestamp; - this.selectedTypes = selectedTypes; this.batchSize = batchSize; // then we need to apply the predicate push down filter ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength)); @@ -130,7 +130,13 @@ public ParquetColumnarRowSplitReader( List blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); this.fileSchema = footer.getFileMetaData().getSchema(); - this.requestedSchema = 
clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive); + + Type[] types = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive); + int[] requestedIndices = IntStream.range(0, types.length).filter(i -> types[i] != null).toArray(); + Type[] readTypes = Arrays.stream(requestedIndices).mapToObj(i -> types[i]).toArray(Type[]::new); + + this.requestedTypes = Arrays.stream(requestedIndices).mapToObj(i -> selectedTypes[i]).toArray(LogicalType[]::new); + this.requestedSchema = Types.buildMessage().addFields(readTypes).named("flink-parquet"); this.reader = new ParquetFileReader( conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns()); @@ -146,23 +152,37 @@ public ParquetColumnarRowSplitReader( checkSchema(); this.writableVectors = createWritableVectors(); - this.columnarBatch = generator.generate(createReadableVectors()); + ColumnVector[] columnVectors = patchedVector(selectedFieldNames.length, createReadableVectors(), requestedIndices); + this.columnarBatch = generator.generate(columnVectors); this.row = new ColumnarRowData(columnarBatch); } + /** + * Patches the given vectors with nulls. + * The vector position that is not requested (or read from file) is patched as null. + * + * @param fields The total selected fields number + * @param vectors The readable vectors + * @param indices The requested indices from the selected fields + */ + private static ColumnVector[] patchedVector(int fields, ColumnVector[] vectors, int[] indices) { + ColumnVector[] patched = new ColumnVector[fields]; + for (int i = 0; i < indices.length; i++) { + patched[indices[i]] = vectors[i]; + } + return patched; + } + /** * Clips `parquetSchema` according to `fieldNames`. */ - private static MessageType clipParquetSchema( + private static Type[] clipParquetSchema( GroupType parquetSchema, String[] fieldNames, boolean caseSensitive) { Type[] types = new Type[fieldNames.length]; if (caseSensitive) { for (int i = 0; i < fieldNames.length; ++i) { String fieldName = fieldNames[i]; - if (parquetSchema.getFieldIndex(fieldName) < 0) { - throw new IllegalArgumentException(fieldName + " does not exist"); - } - types[i] = parquetSchema.getType(fieldName); + types[i] = parquetSchema.containsField(fieldName) ? parquetSchema.getType(fieldName) : null; } } else { Map caseInsensitiveFieldMap = new HashMap<>(); @@ -178,23 +198,20 @@ private static MessageType clipParquetSchema( } for (int i = 0; i < fieldNames.length; ++i) { Type type = caseInsensitiveFieldMap.get(fieldNames[i].toLowerCase(Locale.ROOT)); - if (type == null) { - throw new IllegalArgumentException(fieldNames[i] + " does not exist"); - } // TODO clip for array,map,row types. 
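clipParquetSchema now tolerates missing columns instead of throwing, and patchedVector fills the gaps with null slots that are later replaced by constant-null vectors. The standalone sketch below restates that patching logic; Object stands in for Flink's ColumnVector so the snippet compiles on its own.

    final class VectorPatchingSketch {
      static Object[] patchWithNulls(int selectedFieldCount, Object[] readVectors, int[] requestedIndices) {
        Object[] patched = new Object[selectedFieldCount];
        for (int i = 0; i < requestedIndices.length; i++) {
          patched[requestedIndices[i]] = readVectors[i]; // every other slot stays null
        }
        return patched;
      }

      public static void main(String[] args) {
        // Fields 0 and 2 were read from the file, fields 1 and 3 are absent and stay null.
        Object[] patched = patchWithNulls(4, new Object[] {"v0", "v2"}, new int[] {0, 2});
        System.out.println(java.util.Arrays.toString(patched)); // [v0, null, v2, null]
      }
    }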
types[i] = type; } } - return Types.buildMessage().addFields(types).named("flink-parquet"); + return types; } private WritableColumnVector[] createWritableVectors() { - WritableColumnVector[] columns = new WritableColumnVector[selectedTypes.length]; - for (int i = 0; i < selectedTypes.length; i++) { + WritableColumnVector[] columns = new WritableColumnVector[requestedTypes.length]; + for (int i = 0; i < requestedTypes.length; i++) { columns[i] = createWritableColumnVector( batchSize, - selectedTypes[i], + requestedTypes[i], requestedSchema.getColumns().get(i).getPrimitiveType()); } return columns; @@ -207,7 +224,7 @@ private WritableColumnVector[] createWritableVectors() { private ColumnVector[] createReadableVectors() { ColumnVector[] vectors = new ColumnVector[writableVectors.length]; for (int i = 0; i < writableVectors.length; i++) { - vectors[i] = selectedTypes[i].getTypeRoot() == LogicalTypeRoot.DECIMAL + vectors[i] = requestedTypes[i].getTypeRoot() == LogicalTypeRoot.DECIMAL ? new ParquetDecimalVector(writableVectors[i]) : writableVectors[i]; } @@ -215,10 +232,6 @@ private ColumnVector[] createReadableVectors() { } private void checkSchema() throws IOException, UnsupportedOperationException { - if (selectedTypes.length != requestedSchema.getFieldCount()) { - throw new RuntimeException("The quality of field type is incompatible with the request schema!"); - } - /* * Check that the requested schema is supported. */ @@ -314,7 +327,7 @@ private void readNextRowGroup() throws IOException { for (int i = 0; i < columns.size(); ++i) { columnReaders[i] = createColumnReader( utcTimestamp, - selectedTypes[i], + requestedTypes[i], columns.get(i), pages.getPageReader(columns.get(i))); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetDecimalVector.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetDecimalVector.java index 2bf55b35d4b09..2749f02f36d3b 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetDecimalVector.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetDecimalVector.java @@ -19,12 +19,9 @@ package org.apache.hudi.table.format.cow; import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.DecimalDataUtils; import org.apache.flink.table.data.vector.BytesColumnVector; import org.apache.flink.table.data.vector.ColumnVector; import org.apache.flink.table.data.vector.DecimalColumnVector; -import org.apache.flink.table.data.vector.IntColumnVector; -import org.apache.flink.table.data.vector.LongColumnVector; /** * Parquet write decimal as int32 and int64 and binary, this class wrap the real vector to @@ -43,22 +40,10 @@ public class ParquetDecimalVector implements DecimalColumnVector { @Override public DecimalData getDecimal(int i, int precision, int scale) { - if (DecimalDataUtils.is32BitDecimal(precision)) { - return DecimalData.fromUnscaledLong( - ((IntColumnVector) vector).getInt(i), - precision, - scale); - } else if (DecimalDataUtils.is64BitDecimal(precision)) { - return DecimalData.fromUnscaledLong( - ((LongColumnVector) vector).getLong(i), - precision, - scale); - } else { - return DecimalData.fromUnscaledBytes( - ((BytesColumnVector) vector).getBytes(i).getBytes(), - precision, - scale); - } + return DecimalData.fromUnscaledBytes( + ((BytesColumnVector) vector).getBytes(i).getBytes(), + precision, + scale); } @Override diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java 
b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java index 778598fa67166..6bb514b429b38 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java @@ -31,7 +31,6 @@ import org.apache.flink.formats.parquet.vector.reader.ShortColumnReader; import org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader; import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.DecimalDataUtils; import org.apache.flink.table.data.TimestampData; import org.apache.flink.table.data.vector.ColumnVector; import org.apache.flink.table.data.vector.VectorizedColumnBatch; @@ -46,7 +45,6 @@ import org.apache.flink.table.data.vector.heap.HeapTimestampVector; import org.apache.flink.table.data.vector.writable.WritableColumnVector; import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.BigIntType; import org.apache.flink.table.types.logical.DecimalType; import org.apache.flink.table.types.logical.IntType; import org.apache.flink.table.types.logical.LogicalType; @@ -110,9 +108,7 @@ public static ParquetColumnarRowSplitReader genPartColumnarRowReader( for (int i = 0; i < vectors.length; i++) { String name = fullFieldNames[selectedFields[i]]; LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType(); - vectors[i] = partitionSpec.containsKey(name) - ? createVectorFromConstant(type, partitionSpec.get(name), batchSize) - : readVectors[selNonPartNames.indexOf(name)]; + vectors[i] = createVector(readVectors, selNonPartNames, name, type, partitionSpec, batchSize); } return new VectorizedColumnBatch(vectors); }; @@ -132,6 +128,24 @@ public static ParquetColumnarRowSplitReader genPartColumnarRowReader( splitLength); } + private static ColumnVector createVector( + ColumnVector[] readVectors, + List selNonPartNames, + String name, + LogicalType type, + Map partitionSpec, + int batchSize) { + if (partitionSpec.containsKey(name)) { + return createVectorFromConstant(type, partitionSpec.get(name), batchSize); + } + ColumnVector readVector = readVectors[selNonPartNames.indexOf(name)]; + if (readVector == null) { + // when the read vector is null, use a constant null vector instead + readVector = createVectorFromConstant(type, null, batchSize); + } + return readVector; + } + private static ColumnVector createVectorFromConstant( LogicalType type, Object value, @@ -197,23 +211,10 @@ private static ColumnVector createVectorFromConstant( DecimalData decimal = value == null ? null : Preconditions.checkNotNull(DecimalData.fromBigDecimal((BigDecimal) value, precision, scale)); - ColumnVector internalVector; - if (DecimalDataUtils.is32BitDecimal(precision)) { - internalVector = createVectorFromConstant( - new IntType(), - decimal == null ? null : (int) decimal.toUnscaledLong(), - batchSize); - } else if (DecimalDataUtils.is64BitDecimal(precision)) { - internalVector = createVectorFromConstant( - new BigIntType(), - decimal == null ? null : decimal.toUnscaledLong(), - batchSize); - } else { - internalVector = createVectorFromConstant( - new VarBinaryType(), - decimal == null ? null : decimal.toUnscaledBytes(), - batchSize); - } + ColumnVector internalVector = createVectorFromConstant( + new VarBinaryType(), + decimal == null ? 
null : decimal.toUnscaledBytes(), + batchSize); return new ParquetDecimalVector(internalVector); case FLOAT: HeapFloatVector fv = new HeapFloatVector(batchSize); @@ -365,29 +366,10 @@ public static WritableColumnVector createWritableColumnVector( "TIME_MICROS original type is not "); return new HeapTimestampVector(batchSize); case DECIMAL: - DecimalType decimalType = (DecimalType) fieldType; - if (DecimalDataUtils.is32BitDecimal(decimalType.getPrecision())) { - checkArgument( - (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY - || typeName == PrimitiveType.PrimitiveTypeName.INT32) - && primitiveType.getOriginalType() == OriginalType.DECIMAL, - "Unexpected type: %s", typeName); - return new HeapIntVector(batchSize); - } else if (DecimalDataUtils.is64BitDecimal(decimalType.getPrecision())) { - checkArgument( - (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY - || typeName == PrimitiveType.PrimitiveTypeName.INT64) - && primitiveType.getOriginalType() == OriginalType.DECIMAL, - "Unexpected type: %s", typeName); - return new HeapLongVector(batchSize); - } else { - checkArgument( - (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY - || typeName == PrimitiveType.PrimitiveTypeName.BINARY) - && primitiveType.getOriginalType() == OriginalType.DECIMAL, - "Unexpected type: %s", typeName); - return new HeapBytesVector(batchSize); - } + checkArgument(typeName == PrimitiveType.PrimitiveTypeName.BINARY + && primitiveType.getOriginalType() == OriginalType.DECIMAL, + "Unexpected type: %s", typeName); + return new HeapBytesVector(batchSize); default: throw new UnsupportedOperationException(fieldType + " is not supported now."); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java index 2042b96739ef7..2bf5bd58edb1f 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.log.InstantRange; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.exception.HoodieException; @@ -45,7 +46,6 @@ import org.apache.flink.api.common.io.RichInputFormat; import org.apache.flink.api.common.io.statistics.BaseStatistics; import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.fs.Path; import org.apache.flink.core.io.InputSplitAssigner; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; @@ -85,8 +85,6 @@ public class MergeOnReadInputFormat private transient org.apache.hadoop.conf.Configuration hadoopConf; - private Path[] paths; - private final MergeOnReadTableState tableState; /** @@ -128,20 +126,23 @@ public class MergeOnReadInputFormat /** * Flag saying whether to emit the deletes. In streaming read mode, downstream - * operators need the delete messages to retract the legacy accumulator. + * operators need the DELETE messages to retract the legacy accumulator. */ private boolean emitDelete; + /** + * Flag saying whether the input format has been closed. 
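Background for the decimal simplification above: once decimals are always carried as unscaled bytes, a single BigInteger round-trip recovers the value for any precision, which is why the separate int32/int64 branches could be dropped. The JDK-only sketch below mirrors what DecimalData.fromUnscaledBytes relies on; it is an illustration, not Hudi or Flink code.

    import java.math.BigDecimal;
    import java.math.BigInteger;

    final class UnscaledBytesRoundTrip {
      static byte[] toUnscaledBytes(BigDecimal value) {
        return value.unscaledValue().toByteArray();
      }

      static BigDecimal fromUnscaledBytes(byte[] bytes, int scale) {
        return new BigDecimal(new BigInteger(bytes), scale);
      }

      public static void main(String[] args) {
        BigDecimal original = new BigDecimal("12345.678");
        BigDecimal restored = fromUnscaledBytes(toUnscaledBytes(original), original.scale());
        System.out.println(original.equals(restored)); // prints true
      }
    }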
+ */ + private boolean closed = true; + private MergeOnReadInputFormat( Configuration conf, - Path[] paths, MergeOnReadTableState tableState, List fieldTypes, String defaultPartName, long limit, boolean emitDelete) { this.conf = conf; - this.paths = paths; this.tableState = tableState; this.fieldNames = tableState.getRowType().getFieldNames(); this.fieldTypes = fieldTypes; @@ -163,9 +164,10 @@ public static Builder builder() { @Override public void open(MergeOnReadInputSplit split) throws IOException { this.currentReadCount = 0L; + this.closed = false; this.hadoopConf = StreamerUtil.getHadoopConf(); if (!(split.getLogPaths().isPresent() && split.getLogPaths().get().size() > 0)) { - if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING)) { + if (split.getInstantRange() != null) { // base file only with commit time filtering this.iterator = new BaseFileOnlyFilteringIterator( split.getInstantRange(), @@ -208,20 +210,13 @@ public void open(MergeOnReadInputSplit split) throws IOException { + "spark partition Index: " + split.getSplitNumber() + "merge type: " + split.getMergeType()); } + mayShiftInputSplit(split); } @Override public void configure(Configuration configuration) { - if (this.paths.length == 0) { - // file path was not specified yet. Try to set it from the parameters. - String filePath = configuration.getString(FlinkOptions.PATH, null); - if (filePath == null) { - throw new IllegalArgumentException("File path was not specified in input format or configuration."); - } else { - this.paths = new Path[] {new Path(filePath)}; - } - } - // may supports nested files in the future. + // no operation + // may support nested files in the future. } @Override @@ -262,12 +257,32 @@ public void close() throws IOException { this.iterator.close(); } this.iterator = null; + this.closed = true; + } + + public boolean isClosed() { + return this.closed; } // ------------------------------------------------------------------------- // Utilities // ------------------------------------------------------------------------- + /** + * Shifts the input split by its consumed records number. + * + *
<p>
Note: This action is time-consuming. + */ + private void mayShiftInputSplit(MergeOnReadInputSplit split) throws IOException { + if (split.isConsumed()) { + // if the input split has been consumed before, + // shift the input split with consumed num of records first + for (long i = 0; i < split.getConsumed() && !reachedEnd(); i++) { + nextRecord(null); + } + } + } + private ParquetColumnarRowSplitReader getFullSchemaReader(String path) throws IOException { return getReader(path, IntStream.range(0, this.tableState.getRowType().getFieldCount()).toArray()); } @@ -433,12 +448,6 @@ public void close() { // ------------------------------------------------------------------------- // Inner Class // ------------------------------------------------------------------------- - - private interface ClosableIterator extends Iterator, AutoCloseable { - @Override - void close(); // override to not throw exception - } - private interface RecordIterator { boolean reachedEnd() throws IOException; @@ -750,7 +759,6 @@ private Option mergeRowWithLog( */ public static class Builder { private Configuration conf; - private Path[] paths; private MergeOnReadTableState tableState; private List fieldTypes; private String defaultPartName; @@ -762,11 +770,6 @@ public Builder config(Configuration conf) { return this; } - public Builder paths(Path[] paths) { - this.paths = paths; - return this; - } - public Builder tableState(MergeOnReadTableState tableState) { this.tableState = tableState; return this; @@ -793,8 +796,8 @@ public Builder emitDelete(boolean emitDelete) { } public MergeOnReadInputFormat build() { - return new MergeOnReadInputFormat(conf, paths, tableState, - fieldTypes, defaultPartName, limit, emitDelete); + return new MergeOnReadInputFormat(conf, tableState, fieldTypes, + defaultPartName, limit, emitDelete); } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputSplit.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputSplit.java index 0c93eeac2edc8..156622c303519 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputSplit.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputSplit.java @@ -33,6 +33,8 @@ public class MergeOnReadInputSplit implements InputSplit { private static final long serialVersionUID = 1L; + private static final long NUM_NO_CONSUMPTION = 0L; + private final int splitNum; private final Option basePath; private final Option> logPaths; @@ -42,6 +44,10 @@ public class MergeOnReadInputSplit implements InputSplit { private final String mergeType; private final Option instantRange; + // for streaming reader to record the consumed offset, + // which is the start of next round reading. 
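The consumed-offset recovery sketched above works as follows: the split counts every record it has handed out, and when the split is re-opened after a failure the reader first skips that many records before emitting new ones. A simplified, self-contained model of that behaviour is shown below; the names are illustrative, the real logic lives in MergeOnReadInputSplit#consume() and MergeOnReadInputFormat#mayShiftInputSplit().

    import java.util.Iterator;
    import java.util.List;

    final class ResumableSplitReaderSketch<T> {
      private long consumed;            // mirrors MergeOnReadInputSplit#consumed
      private Iterator<T> iterator;

      void open(List<T> records) {
        iterator = records.iterator();
        // Shift the split by the number of records consumed in the previous attempt.
        for (long i = 0; i < consumed && iterator.hasNext(); i++) {
          iterator.next();
        }
      }

      T nextRecord() {
        T record = iterator.next();
        consumed++;                     // mirrors split.consume() in the real reader
        return record;
      }
    }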
+ private long consumed = NUM_NO_CONSUMPTION; + public MergeOnReadInputSplit( int splitNum, @Nullable String basePath, @@ -94,6 +100,18 @@ public int getSplitNumber() { return this.splitNum; } + public void consume() { + this.consumed += 1L; + } + + public long getConsumed() { + return consumed; + } + + public boolean isConsumed() { + return this.consumed != NUM_NO_CONSUMPTION; + } + @Override public String toString() { return "MergeOnReadInputSplit{" @@ -107,5 +125,4 @@ public String toString() { + ", instantRange=" + instantRange + '}'; } - } diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java b/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java index a0de79d91a232..e0056f9a1d841 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java +++ b/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java @@ -18,7 +18,7 @@ package org.apache.hudi.util; -import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -76,6 +76,21 @@ public static void setAvroSchema(Configuration conf, HoodieTableMetaClient metaC conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, tableAvroSchema.toString()); } + /** + * Infers the changelog mode based on the data file schema(including metadata fields). + * + *
<p>
We can improve the code if the changelog mode is set up as table config. + * + * @param conf The configuration + */ + public static void inferChangelogMode(Configuration conf, HoodieTableMetaClient metaClient) throws Exception { + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + Schema tableAvroSchema = tableSchemaResolver.getTableAvroSchemaFromDataFile(); + if (tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null) { + conf.setBoolean(FlinkOptions.CHANGELOG_ENABLED, true); + } + } + /** * Cleans the metadata file for given instant {@code instant}. */ @@ -95,21 +110,55 @@ public static void cleanInstant(HoodieTableMetaClient metaClient, HoodieInstant } } - public static void rollbackCompaction(HoodieFlinkTable table, HoodieFlinkWriteClient writeClient, Configuration conf) { - String curInstantTime = HoodieActiveTimeline.createNewInstantTime(); - int deltaSeconds = conf.getInteger(FlinkOptions.COMPACTION_DELTA_SECONDS); + public static void rollbackCompaction(HoodieFlinkTable table, String instantTime) { + HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(instantTime); + if (table.getMetaClient().reloadActiveTimeline().filterPendingCompactionTimeline().containsInstant(inflightInstant)) { + LOG.warn("Rollback failed compaction instant: [" + instantTime + "]"); + table.rollbackInflightCompaction(inflightInstant); + } + } + + /** + * Force rolls back all the inflight compaction instants, especially for job failover restart. + * + * @param table The hoodie table + */ + public static void rollbackCompaction(HoodieFlinkTable table) { HoodieTimeline inflightCompactionTimeline = table.getActiveTimeline() .filterPendingCompactionTimeline() .filter(instant -> - instant.getState() == HoodieInstant.State.INFLIGHT - && StreamerUtil.instantTimeDiffSeconds(curInstantTime, instant.getTimestamp()) >= deltaSeconds); + instant.getState() == HoodieInstant.State.INFLIGHT); inflightCompactionTimeline.getInstants().forEach(inflightInstant -> { - LOG.info("Rollback the pending compaction instant: " + inflightInstant); - writeClient.rollbackInflightCompaction(inflightInstant, table); + LOG.info("Rollback the inflight compaction instant: " + inflightInstant + " for failover"); + table.rollbackInflightCompaction(inflightInstant); table.getMetaClient().reloadActiveTimeline(); }); } + /** + * Rolls back the earliest compaction if there exists. + * + *
<p>
Makes the strategy not that radical: firstly check whether there exists inflight compaction instants, + * rolls back the first inflight instant only if it has timed out. That means, if there are + * multiple timed out instants on the timeline, we only roll back the first one at a time. + */ + public static void rollbackEarliestCompaction(HoodieFlinkTable table, Configuration conf) { + Option earliestInflight = table.getActiveTimeline() + .filterPendingCompactionTimeline() + .filter(instant -> + instant.getState() == HoodieInstant.State.INFLIGHT).firstInstant(); + if (earliestInflight.isPresent()) { + HoodieInstant instant = earliestInflight.get(); + String currentTime = HoodieActiveTimeline.createNewInstantTime(); + int timeout = conf.getInteger(FlinkOptions.COMPACTION_TIMEOUT_SECONDS); + if (StreamerUtil.instantTimeDiffSeconds(currentTime, instant.getTimestamp()) >= timeout) { + LOG.info("Rollback the inflight compaction instant: " + instant + " for timeout(" + timeout + "s)"); + table.rollbackInflightCompaction(instant); + table.getMetaClient().reloadActiveTimeline(); + } + } + } + /** * Returns whether the execution sequence is LIFO. */ diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/FlinkTables.java b/hudi-flink/src/main/java/org/apache/hudi/util/FlinkTables.java new file mode 100644 index 0000000000000..6918a06b186b8 --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/util/FlinkTables.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.client.FlinkTaskContextSupplier; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieFlinkTable; + +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.configuration.Configuration; + +import static org.apache.hudi.util.StreamerUtil.getHadoopConf; +import static org.apache.hudi.util.StreamerUtil.getHoodieClientConfig; + +/** + * Utilities for {@link org.apache.hudi.table.HoodieFlinkTable}. + */ +public class FlinkTables { + private FlinkTables() { + } + + /** + * Creates the hoodie flink table. + * + *
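The timeout guard in rollbackEarliestCompaction above only ever considers the earliest inflight compaction instant and rolls it back once it is older than the configured timeout. The sketch below restates that guard with JDK time types; the class and method names are illustrative, not Hudi API.

    import java.time.Duration;
    import java.time.Instant;
    import java.util.Optional;

    final class CompactionRollbackGuard {
      static boolean shouldRollback(Optional<Instant> earliestInflightStart, Duration timeout, Instant now) {
        return earliestInflightStart
            .map(start -> Duration.between(start, now).compareTo(timeout) >= 0)
            .orElse(false);
      }
    }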
<p>
This expects to be used by client. + */ + public static HoodieFlinkTable createTable(Configuration conf, RuntimeContext runtimeContext) { + HoodieFlinkEngineContext context = new HoodieFlinkEngineContext( + new SerializableConfiguration(getHadoopConf()), + new FlinkTaskContextSupplier(runtimeContext)); + HoodieWriteConfig writeConfig = getHoodieClientConfig(conf, true); + return HoodieFlinkTable.create(writeConfig, context); + } + + /** + * Creates the hoodie flink table. + * + *
<p>
This expects to be used by client. + */ + public static HoodieFlinkTable createTable( + HoodieWriteConfig writeConfig, + org.apache.hadoop.conf.Configuration hadoopConf, + RuntimeContext runtimeContext) { + HoodieFlinkEngineContext context = new HoodieFlinkEngineContext( + new SerializableConfiguration(hadoopConf), + new FlinkTaskContextSupplier(runtimeContext)); + return HoodieFlinkTable.create(writeConfig, context); + } + + /** + * Creates the hoodie flink table. + * + *
<p>
This expects to be used by driver. + */ + public static HoodieFlinkTable createTable(Configuration conf) { + HoodieWriteConfig writeConfig = StreamerUtil.getHoodieClientConfig(conf, true, false); + return HoodieFlinkTable.create(writeConfig, HoodieFlinkEngineContext.DEFAULT); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/bloom/FlinkHoodieBloomIndex.java b/hudi-flink/src/main/java/org/apache/hudi/util/InputFormats.java similarity index 59% rename from hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/bloom/FlinkHoodieBloomIndex.java rename to hudi-flink/src/main/java/org/apache/hudi/util/InputFormats.java index 355dced71d8ad..f193357e88809 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/bloom/FlinkHoodieBloomIndex.java +++ b/hudi-flink/src/main/java/org/apache/hudi/util/InputFormats.java @@ -7,7 +7,7 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,17 +16,18 @@ * limitations under the License. */ -package org.apache.hudi.index.bloom; +package org.apache.hudi.util; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.flink.api.common.io.InputFormat; +import org.apache.flink.api.java.io.CollectionInputFormat; +import org.apache.flink.table.data.RowData; + +import java.util.Collections; /** - * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata. + * Utilities for all kinds of {@link org.apache.flink.api.common.io.InputFormat}s. 
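A hypothetical usage sketch for the new FlinkTables factory, using only the overloads introduced above: operator (client) code passes its RuntimeContext so the task context reflects the real subtask, while coordinator (driver) code uses the single-argument overload that relies on the constant task context supplier. The wrapper class here is invented for illustration.

    import org.apache.flink.api.common.functions.RuntimeContext;
    import org.apache.flink.configuration.Configuration;
    import org.apache.hudi.table.HoodieFlinkTable;
    import org.apache.hudi.util.FlinkTables;

    final class FlinkTablesUsage {
      @SuppressWarnings("rawtypes")
      static HoodieFlinkTable onTask(Configuration conf, RuntimeContext runtimeContext) {
        return FlinkTables.createTable(conf, runtimeContext);   // client / operator side
      }

      @SuppressWarnings("rawtypes")
      static HoodieFlinkTable onDriver(Configuration conf) {
        return FlinkTables.createTable(conf);                   // driver / coordinator side
      }
    }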
*/ -@SuppressWarnings("checkstyle:LineLength") -public class FlinkHoodieBloomIndex extends HoodieBaseBloomIndex { - public FlinkHoodieBloomIndex(HoodieWriteConfig config) { - super(config); - } +public class InputFormats { + public static final InputFormat EMPTY_INPUT_FORMAT = + new CollectionInputFormat<>(Collections.emptyList(), null); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java b/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java index 04eeab8b377af..3a31253a2c891 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java +++ b/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java @@ -28,18 +28,22 @@ import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCleaningPolicy; -import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieMemoryConfig; +import org.apache.hudi.config.HoodiePayloadConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.schema.FilebasedSchemaProvider; @@ -148,13 +152,27 @@ public static org.apache.hadoop.conf.Configuration getHadoopConf() { return FlinkClientUtil.getHadoopConf(); } + /** + * Mainly used for tests. 
+ */ public static HoodieWriteConfig getHoodieClientConfig(Configuration conf) { + return getHoodieClientConfig(conf, false, false); + } + + public static HoodieWriteConfig getHoodieClientConfig(Configuration conf, boolean loadFsViewStorageConfig) { + return getHoodieClientConfig(conf, false, loadFsViewStorageConfig); + } + + public static HoodieWriteConfig getHoodieClientConfig( + Configuration conf, + boolean enableEmbeddedTimelineService, + boolean loadFsViewStorageConfig) { HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder() .withEngineType(EngineType.FLINK) .withPath(conf.getString(FlinkOptions.PATH)) - .combineInput(conf.getBoolean(FlinkOptions.INSERT_DROP_DUPS), true) - .withMergeAllowDuplicateOnInserts(allowDuplicateInserts(conf)) + .combineInput(conf.getBoolean(FlinkOptions.PRE_COMBINE), true) + .withMergeAllowDuplicateOnInserts(OptionsResolver.insertClustering(conf)) .withCompactionConfig( HoodieCompactionConfig.newBuilder() .withPayloadClass(conf.getString(FlinkOptions.PAYLOAD_CLASS_NAME)) @@ -189,13 +207,24 @@ public static HoodieWriteConfig getHoodieClientConfig(Configuration conf) { .enable(conf.getBoolean(FlinkOptions.METADATA_ENABLED)) .withMaxNumDeltaCommitsBeforeCompaction(conf.getInteger(FlinkOptions.METADATA_COMPACTION_DELTA_COMMITS)) .build()) + .withPayloadConfig(HoodiePayloadConfig.newBuilder() + .withPayloadOrderingField(conf.getString(FlinkOptions.PRECOMBINE_FIELD)) + .withPayloadEventTimeField(conf.getString(FlinkOptions.PRECOMBINE_FIELD)) + .build()) + .withEmbeddedTimelineServerEnabled(enableEmbeddedTimelineService) .withEmbeddedTimelineServerReuseEnabled(true) // make write client embedded timeline service singleton .withAutoCommit(false) .withAllowOperationMetadataField(conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)) - .withProps(flinkConf2TypedProperties(conf)); - - builder = builder.withSchema(getSourceSchema(conf).toString()); - return builder.build(); + .withProps(flinkConf2TypedProperties(conf)) + .withSchema(getSourceSchema(conf).toString()); + + HoodieWriteConfig writeConfig = builder.build(); + if (loadFsViewStorageConfig) { + // do not use the builder to give a change for recovering the original fs view storage config + FileSystemViewStorageConfig viewStorageConfig = ViewStorageProperties.loadFromProperties(conf.getString(FlinkOptions.PATH)); + writeConfig.setViewStorageConfig(viewStorageConfig); + } + return writeConfig; } /** @@ -210,6 +239,8 @@ public static TypedProperties flinkConf2TypedProperties(Configuration conf) { Properties properties = new Properties(); // put all the set options flatConf.addAllToProperties(properties); + // ugly: table keygen clazz, needed by TwoToThreeUpgradeHandler + properties.put(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key(), conf.getString(FlinkOptions.KEYGEN_CLASS_NAME)); // put all the default options for (ConfigOption option : FlinkOptions.optionalOptions()) { if (!flatConf.contains(option) && option.hasDefaultValue()) { @@ -230,27 +261,32 @@ public static void checkRequiredProperties(TypedProperties props, List c * @param conf the configuration * @throws IOException if errors happens when writing metadata */ - public static void initTableIfNotExists(Configuration conf) throws IOException { + public static HoodieTableMetaClient initTableIfNotExists(Configuration conf) throws IOException { final String basePath = conf.getString(FlinkOptions.PATH); final org.apache.hadoop.conf.Configuration hadoopConf = StreamerUtil.getHadoopConf(); if (!tableExists(basePath, hadoopConf)) { - 
HoodieTableMetaClient.withPropertyBuilder() + HoodieTableMetaClient metaClient = HoodieTableMetaClient.withPropertyBuilder() .setTableType(conf.getString(FlinkOptions.TABLE_TYPE)) .setTableName(conf.getString(FlinkOptions.TABLE_NAME)) .setRecordKeyFields(conf.getString(FlinkOptions.RECORD_KEY_FIELD, null)) .setPayloadClassName(conf.getString(FlinkOptions.PAYLOAD_CLASS_NAME)) + .setPreCombineField(OptionsResolver.getPreCombineField(conf)) .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) .setPartitionFields(conf.getString(FlinkOptions.PARTITION_PATH_FIELD, null)) - .setPreCombineField(conf.getString(FlinkOptions.PRECOMBINE_FIELD)) + .setKeyGeneratorClassProp(conf.getString(FlinkOptions.KEYGEN_CLASS_NAME)) + .setHiveStylePartitioningEnable(conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING)) + .setUrlEncodePartitioning(conf.getBoolean(FlinkOptions.URL_ENCODE_PARTITIONING)) .setTimelineLayoutVersion(1) .initTable(hadoopConf, basePath); LOG.info("Table initialized under base path {}", basePath); + return metaClient; } else { LOG.info("Table [{}/{}] already exists, no need to initialize the table", basePath, conf.getString(FlinkOptions.TABLE_NAME)); + return StreamerUtil.createMetaClient(basePath, hadoopConf); } // Do not close the filesystem in order to use the CACHE, - // some of the filesystems release the handles in #close method. + // some filesystems release the handles in #close method. } /** @@ -274,33 +310,29 @@ public static String generateBucketKey(String partitionPath, String fileId) { } /** - * Returns whether needs to schedule the async compaction. + * Returns whether there is need to schedule the async compaction. * * @param conf The flink configuration. */ public static boolean needsAsyncCompaction(Configuration conf) { - return conf.getString(FlinkOptions.TABLE_TYPE) - .toUpperCase(Locale.ROOT) - .equals(FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + return OptionsResolver.isMorTable(conf) && conf.getBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED); } /** - * Returns whether needs to schedule the compaction plan. + * Returns whether there is need to schedule the compaction plan. * * @param conf The flink configuration. */ public static boolean needsScheduleCompaction(Configuration conf) { - return conf.getString(FlinkOptions.TABLE_TYPE) - .toUpperCase(Locale.ROOT) - .equals(FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + return OptionsResolver.isMorTable(conf) && conf.getBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED); } /** * Creates the meta client for reader. * - *
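The table bootstrap above now seeds more of the table config (key generator class, hive-style and URL-encoded partitioning, pre-combine field) and always returns a meta client. A minimal sketch is shown below, using only property-builder calls that appear in the patch; the literal option values are illustrative placeholders for what FlinkOptions would normally supply.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hudi.common.table.HoodieTableMetaClient;

    import java.io.IOException;

    final class TableBootstrapSketch {
      static HoodieTableMetaClient bootstrap(Configuration hadoopConf, String basePath) throws IOException {
        return HoodieTableMetaClient.withPropertyBuilder()
            .setTableType("MERGE_ON_READ")
            .setTableName("t1")
            .setRecordKeyFields("uuid")
            .setPartitionFields("partition")
            .setPreCombineField("ts")
            .setKeyGeneratorClassProp("org.apache.hudi.keygen.SimpleAvroKeyGenerator")
            .setHiveStylePartitioningEnable(false)
            .setUrlEncodePartitioning(false)
            .setTimelineLayoutVersion(1)
            .initTable(hadoopConf, basePath);
      }
    }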

The streaming pipeline process is long running, so empty table path is allowed, + *

The streaming pipeline process is long-running, so empty table path is allowed, * the reader would then check and refresh the meta client. * * @see org.apache.hudi.source.StreamReadMonitoringFunction @@ -339,6 +371,8 @@ public static HoodieTableMetaClient createMetaClient(Configuration conf) { /** * Creates the Flink write client. + * + *

This expects to be used by the client; the driver should start an embedded timeline server. */ public static HoodieFlinkWriteClient createWriteClient(Configuration conf, RuntimeContext runtimeContext) { HoodieFlinkEngineContext context = @@ -346,29 +380,41 @@ public static HoodieFlinkWriteClient createWriteClient(Configuration conf, Runti new SerializableConfiguration(getHadoopConf()), new FlinkTaskContextSupplier(runtimeContext)); - return new HoodieFlinkWriteClient<>(context, getHoodieClientConfig(conf)); + HoodieWriteConfig writeConfig = getHoodieClientConfig(conf, true); + return new HoodieFlinkWriteClient<>(context, writeConfig); } /** * Creates the Flink write client. * + *

This expects to be used by the driver; the client can then send requests for the file system view. + *

The task context supplier is a constant: the write token is always '0-1-0'. */ - public static HoodieFlinkWriteClient createWriteClient(Configuration conf) { - return new HoodieFlinkWriteClient<>(HoodieFlinkEngineContext.DEFAULT, getHoodieClientConfig(conf)); + public static HoodieFlinkWriteClient createWriteClient(Configuration conf) throws IOException { + HoodieWriteConfig writeConfig = getHoodieClientConfig(conf, true, false); + // create the filesystem view storage properties for client + FileSystemViewStorageConfig viewStorageConfig = writeConfig.getViewStorageConfig(); + // rebuild the view storage config with simplified options. + FileSystemViewStorageConfig rebuilt = FileSystemViewStorageConfig.newBuilder() + .withStorageType(viewStorageConfig.getStorageType()) + .withRemoteServerHost(viewStorageConfig.getRemoteViewServerHost()) + .withRemoteServerPort(viewStorageConfig.getRemoteViewServerPort()).build(); + ViewStorageProperties.createProperties(conf.getString(FlinkOptions.PATH), rebuilt); + return new HoodieFlinkWriteClient<>(HoodieFlinkEngineContext.DEFAULT, writeConfig); } /** - * Return the median instant time between the given two instant time. + * Returns the median instant time between the given two instant time. */ public static String medianInstantTime(String highVal, String lowVal) { try { - long high = HoodieActiveTimeline.COMMIT_FORMATTER.parse(highVal).getTime(); - long low = HoodieActiveTimeline.COMMIT_FORMATTER.parse(lowVal).getTime(); + long high = HoodieActiveTimeline.parseInstantTime(highVal).getTime(); + long low = HoodieActiveTimeline.parseInstantTime(lowVal).getTime(); ValidationUtils.checkArgument(high > low, "Instant [" + highVal + "] should have newer timestamp than instant [" + lowVal + "]"); long median = low + (high - low) / 2; - return HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date(median)); + return HoodieActiveTimeline.formatInstantTime(new Date(median)); } catch (ParseException e) { throw new HoodieException("Get median instant time with interval [" + lowVal + ", " + highVal + "] error", e); } @@ -379,8 +425,8 @@ public static String medianInstantTime(String highVal, String lowVal) { */ public static long instantTimeDiffSeconds(String newInstantTime, String oldInstantTime) { try { - long newTimestamp = HoodieActiveTimeline.COMMIT_FORMATTER.parse(newInstantTime).getTime(); - long oldTimestamp = HoodieActiveTimeline.COMMIT_FORMATTER.parse(oldInstantTime).getTime(); + long newTimestamp = HoodieActiveTimeline.parseInstantTime(newInstantTime).getTime(); + long oldTimestamp = HoodieActiveTimeline.parseInstantTime(oldInstantTime).getTime(); return (newTimestamp - oldTimestamp) / 1000; } catch (ParseException e) { throw new HoodieException("Get instant time diff with interval [" + oldInstantTime + ", " + newInstantTime + "] error", e); @@ -399,6 +445,10 @@ public static Option createTransformer(List classNames) thr } } + /** + * Returns whether the give file is in valid hoodie format. + * For example, filtering out the empty or corrupt files. 
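To make the helper's contract concrete, here is a hypothetical caller (not part of this patch) that filters a partition listing with it, assuming the usual Hadoop and java.util imports:

    // Keeps only files that pass the validity check above, skipping empty or corrupt ones.
    static List<FileStatus> listValidFiles(FileSystem fs, Path partitionPath) throws IOException {
      return Arrays.stream(fs.listStatus(partitionPath))
          .filter(StreamerUtil::isValidFile)
          .collect(Collectors.toList());
    }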
+ */ public static boolean isValidFile(FileStatus fileStatus) { final String extension = FSUtils.getFileExtension(fileStatus.getPath().toString()); if (PARQUET.getFileExtension().equals(extension)) { @@ -416,11 +466,32 @@ public static boolean isValidFile(FileStatus fileStatus) { return fileStatus.getLen() > 0; } - public static boolean allowDuplicateInserts(Configuration conf) { - WriteOperationType operationType = WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)); - return operationType == WriteOperationType.INSERT && !conf.getBoolean(FlinkOptions.INSERT_DEDUP); + public static String getLastPendingInstant(HoodieTableMetaClient metaClient) { + return getLastPendingInstant(metaClient, true); + } + + public static String getLastPendingInstant(HoodieTableMetaClient metaClient, boolean reloadTimeline) { + if (reloadTimeline) { + metaClient.reloadActiveTimeline(); + } + return metaClient.getCommitsTimeline().filterInflights() + .lastInstant() + .map(HoodieInstant::getTimestamp) + .orElse(null); + } + + public static String getLastCompletedInstant(HoodieTableMetaClient metaClient) { + return metaClient.getCommitsTimeline().filterCompletedInstants() + .lastInstant() + .map(HoodieInstant::getTimestamp) + .orElse(null); } + /** + * Returns whether there are successful commits on the timeline. + * @param metaClient The meta client + * @return true if there is any successful commit + */ public static boolean haveSuccessfulCommits(HoodieTableMetaClient metaClient) { return !metaClient.getCommitsTimeline().filterCompletedInstants().empty(); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/ViewStorageProperties.java b/hudi-flink/src/main/java/org/apache/hudi/util/ViewStorageProperties.java new file mode 100644 index 0000000000000..da55e27f0c03b --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/util/ViewStorageProperties.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Date; +import java.util.Properties; + +import static org.apache.hudi.common.table.HoodieTableMetaClient.AUXILIARYFOLDER_NAME; + +/** + * Helper class to read/write {@link FileSystemViewStorageConfig}. 
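A hedged round-trip sketch of the helper class described above (not part of this patch): the driver persists a remote-first view storage config and a task reloads it, which is how createWriteClient(conf) and the write tasks cooperate. The host and port values are illustrative only:

    static void persistAndReload(String basePath) throws IOException {
      FileSystemViewStorageConfig config = FileSystemViewStorageConfig.newBuilder()
          .withStorageType(FileSystemViewStorageType.REMOTE_FIRST)
          .withRemoteServerHost("localhost")   // illustrative value
          .withRemoteServerPort(26754)         // illustrative value
          .build();
      // writes view_storage_conf.properties under the table's auxiliary folder
      ViewStorageProperties.createProperties(basePath, config);
      // a task can later rebuild the same config from that file
      FileSystemViewStorageConfig reloaded = ViewStorageProperties.loadFromProperties(basePath);
    }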
+ */ +public class ViewStorageProperties { + private static final Logger LOG = LoggerFactory.getLogger(ViewStorageProperties.class); + + private static final String FILE_NAME = "view_storage_conf.properties"; + + /** + * Initialize the {@link #FILE_NAME} meta file. + */ + public static void createProperties( + String basePath, + FileSystemViewStorageConfig config) throws IOException { + Path propertyPath = getPropertiesFilePath(basePath); + FileSystem fs = FSUtils.getFs(basePath, StreamerUtil.getHadoopConf()); + fs.delete(propertyPath, false); + try (FSDataOutputStream outputStream = fs.create(propertyPath)) { + config.getProps().store(outputStream, + "Filesystem view storage properties saved on " + new Date(System.currentTimeMillis())); + } + } + + /** + * Read the {@link FileSystemViewStorageConfig} with given table base path. + */ + public static FileSystemViewStorageConfig loadFromProperties(String basePath) { + Path propertyPath = getPropertiesFilePath(basePath); + LOG.info("Loading filesystem view storage properties from " + propertyPath); + FileSystem fs = FSUtils.getFs(basePath, StreamerUtil.getHadoopConf()); + Properties props = new Properties(); + try { + try (FSDataInputStream inputStream = fs.open(propertyPath)) { + props.load(inputStream); + } + return FileSystemViewStorageConfig.newBuilder().fromProperties(props).build(); + } catch (IOException e) { + throw new HoodieIOException("Could not load filesystem view storage properties from " + propertyPath, e); + } + } + + private static Path getPropertiesFilePath(String basePath) { + String auxPath = basePath + Path.SEPARATOR + AUXILIARYFOLDER_NAME; + return new Path(auxPath, FILE_NAME); + } +} diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/StreamWriteITCase.java b/hudi-flink/src/test/java/org/apache/hudi/sink/StreamWriteITCase.java index 1890d07d2138b..2c8fb490a8781 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/StreamWriteITCase.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/StreamWriteITCase.java @@ -18,27 +18,13 @@ package org.apache.hudi.sink; -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.common.util.Option; import org.apache.hudi.configuration.FlinkOptions; -import org.apache.hudi.sink.compact.CompactFunction; -import org.apache.hudi.sink.compact.CompactionCommitEvent; -import org.apache.hudi.sink.compact.CompactionCommitSink; -import org.apache.hudi.sink.compact.CompactionPlanSourceFunction; -import org.apache.hudi.sink.compact.FlinkCompactionConfig; import org.apache.hudi.sink.transform.ChainedTransformer; import org.apache.hudi.sink.transform.Transformer; import org.apache.hudi.sink.utils.Pipelines; -import org.apache.hudi.table.HoodieFlinkTable; import org.apache.hudi.util.AvroSchemaConverter; -import org.apache.hudi.util.CompactionUtil; import org.apache.hudi.util.StreamerUtil; import org.apache.hudi.utils.TestConfigurations; import org.apache.hudi.utils.TestData; @@ -58,11 +44,6 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import 
org.apache.flink.streaming.api.functions.source.FileProcessingMode; -import org.apache.flink.streaming.api.operators.ProcessOperator; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.config.ExecutionConfigOptions; -import org.apache.flink.table.api.internal.TableEnvironmentImpl; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; @@ -80,8 +61,6 @@ import java.util.Objects; import java.util.concurrent.TimeUnit; -import static org.junit.jupiter.api.Assertions.assertTrue; - /** * Integration test for Flink Hoodie stream sink. */ @@ -150,84 +129,6 @@ public void testWriteToHoodieWithoutTransformer() throws Exception { testWriteToHoodie(null, EXPECTED); } - @Test - public void testHoodieFlinkCompactor() throws Exception { - // Create hoodie table and insert into data. - EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); - TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); - tableEnv.getConfig().getConfiguration() - .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1); - Map options = new HashMap<>(); - options.put(FlinkOptions.COMPACTION_ASYNC_ENABLED.key(), "false"); - options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); - options.put(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); - String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); - tableEnv.executeSql(hoodieTableDDL); - String insertInto = "insert into t1 values\n" - + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),\n" - + "('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n" - + "('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),\n" - + "('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n" - + "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n" - + "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n" - + "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n" - + "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')"; - tableEnv.executeSql(insertInto).await(); - - // wait for the asynchronous commit to finish - TimeUnit.SECONDS.sleep(3); - - // Make configuration and setAvroSchema. - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - FlinkCompactionConfig cfg = new FlinkCompactionConfig(); - cfg.path = tempFile.getAbsolutePath(); - Configuration conf = FlinkCompactionConfig.toFlinkConfig(cfg); - conf.setString(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); - - // create metaClient - HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(conf); - - // set the table name - conf.setString(FlinkOptions.TABLE_NAME, metaClient.getTableConfig().getTableName()); - - // set table schema - CompactionUtil.setAvroSchema(conf, metaClient); - - // judge whether have operation - // To compute the compaction instant time and do compaction. 
- String compactionInstantTime = CompactionUtil.getCompactionInstantTime(metaClient); - HoodieFlinkWriteClient writeClient = StreamerUtil.createWriteClient(conf, null); - boolean scheduled = writeClient.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); - - assertTrue(scheduled, "The compaction plan should be scheduled"); - - HoodieFlinkTable table = writeClient.getHoodieTable(); - // generate compaction plan - // should support configurable commit metadata - HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan( - table.getMetaClient(), compactionInstantTime); - - HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); - // Mark instant as compaction inflight - table.getActiveTimeline().transitionCompactionRequestedToInflight(instant); - - env.addSource(new CompactionPlanSourceFunction(compactionPlan, compactionInstantTime)) - .name("compaction_source") - .uid("uid_compaction_source") - .rebalance() - .transform("compact_task", - TypeInformation.of(CompactionCommitEvent.class), - new ProcessOperator<>(new CompactFunction(conf))) - .setParallelism(compactionPlan.getOperations().size()) - .addSink(new CompactionCommitSink(conf)) - .name("clean_commits") - .uid("uid_clean_commits") - .setParallelism(1); - - env.execute("flink_hudi_compaction"); - TestData.checkWrittenFullData(tempFile, EXPECTED); - } - @Test public void testMergeOnReadWriteWithCompaction() throws Exception { int parallelism = 4; diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/TestStreamWriteOperatorCoordinator.java b/hudi-flink/src/test/java/org/apache/hudi/sink/TestStreamWriteOperatorCoordinator.java index f379893229db8..be2e334a4c964 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/TestStreamWriteOperatorCoordinator.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/TestStreamWriteOperatorCoordinator.java @@ -20,7 +20,6 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -30,6 +29,7 @@ import org.apache.hudi.sink.utils.MockCoordinatorExecutor; import org.apache.hudi.util.StreamerUtil; import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestUtils; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.jobgraph.OperatorID; @@ -94,8 +94,8 @@ void testInstantState() { coordinator.handleEventFromOperator(1, event1); coordinator.notifyCheckpointComplete(1); - String inflight = coordinator.getWriteClient().getLastPendingInstant(HoodieTableType.COPY_ON_WRITE); - String lastCompleted = coordinator.getWriteClient().getLastCompletedInstant(HoodieTableType.COPY_ON_WRITE); + String inflight = TestUtils.getLastPendingInstant(tempFile.getAbsolutePath()); + String lastCompleted = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); assertThat("Instant should be complete", lastCompleted, is(instant)); assertNotEquals("", inflight, "Should start a new instant"); assertNotEquals(instant, inflight, "Should start a new instant"); @@ -145,7 +145,7 @@ public void testCheckpointCompleteWithPartialEvents() { assertDoesNotThrow(() -> coordinator.notifyCheckpointComplete(1), "Returns early for empty write results"); - String lastCompleted = coordinator.getWriteClient().getLastCompletedInstant(HoodieTableType.COPY_ON_WRITE); + 
String lastCompleted = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); assertNull(lastCompleted, "Returns early for empty write results"); assertNull(coordinator.getEventBuffer()[0]); @@ -153,7 +153,7 @@ public void testCheckpointCompleteWithPartialEvents() { coordinator.handleEventFromOperator(1, event1); assertDoesNotThrow(() -> coordinator.notifyCheckpointComplete(2), "Commits the instant with partial events anyway"); - lastCompleted = coordinator.getWriteClient().getLastCompletedInstant(HoodieTableType.COPY_ON_WRITE); + lastCompleted = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); assertThat("Commits the instant with partial events anyway", lastCompleted, is(instant)); } @@ -192,7 +192,6 @@ void testSyncMetadataTable() throws Exception { coordinator = new StreamWriteOperatorCoordinator(conf, context); coordinator.start(); coordinator.setExecutor(new MockCoordinatorExecutor(context)); - coordinator.setMetadataSyncExecutor(new MockCoordinatorExecutor(context)); final WriteMetadataEvent event0 = WriteMetadataEvent.emptyBootstrap(0); @@ -205,10 +204,10 @@ void testSyncMetadataTable() throws Exception { HoodieTableMetaClient metadataTableMetaClient = StreamerUtil.createMetaClient(metadataTableBasePath); HoodieTimeline completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedInstants(); assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(1L)); - assertThat(completedTimeline.lastInstant().get().getTimestamp(), is("0000000000000")); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP)); // test metadata table compaction - // write another 4 commits + // write another 3 commits for (int i = 1; i < 4; i++) { instant = mockWriteWithMetadata(); metadataTableMetaClient.reloadActiveTimeline(); @@ -246,7 +245,13 @@ private static WriteMetadataEvent createOperatorEvent( double failureFraction) { final WriteStatus writeStatus = new WriteStatus(trackSuccessRecords, failureFraction); writeStatus.setPartitionPath(partitionPath); - writeStatus.setStat(new HoodieWriteStat()); + + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setPartitionPath(partitionPath); + writeStat.setFileId("fileId123"); + writeStat.setPath("path123"); + + writeStatus.setStat(writeStat); return WriteMetadataEvent.builder() .taskID(taskId) diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java index b403f3c657209..a91f45263ff25 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java @@ -19,91 +19,44 @@ package org.apache.hudi.sink; import org.apache.hudi.client.HoodieFlinkWriteClient; -import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.table.view.FileSystemViewStorageType; import org.apache.hudi.configuration.FlinkOptions; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.sink.event.WriteMetadataEvent; -import org.apache.hudi.sink.utils.InsertFunctionWrapper; -import org.apache.hudi.sink.utils.StreamWriteFunctionWrapper; +import 
org.apache.hudi.sink.utils.TestWriteBase; import org.apache.hudi.util.StreamerUtil; import org.apache.hudi.utils.TestConfigurations; import org.apache.hudi.utils.TestData; import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.operators.coordination.OperatorEvent; -import org.apache.flink.table.data.RowData; -import org.hamcrest.MatcherAssert; -import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import java.io.File; -import java.util.Comparator; +import java.io.IOException; import java.util.HashMap; -import java.util.List; import java.util.Map; -import java.util.stream.Collectors; -import static org.hamcrest.CoreMatchers.instanceOf; -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertSame; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; /** * Test cases for stream write. */ -public class TestWriteCopyOnWrite { - - protected static final Map EXPECTED1 = new HashMap<>(); - - protected static final Map EXPECTED2 = new HashMap<>(); - - protected static final Map EXPECTED3 = new HashMap<>(); - - static { - EXPECTED1.put("par1", "[id1,par1,id1,Danny,23,1,par1, id2,par1,id2,Stephen,33,2,par1]"); - EXPECTED1.put("par2", "[id3,par2,id3,Julian,53,3,par2, id4,par2,id4,Fabian,31,4,par2]"); - EXPECTED1.put("par3", "[id5,par3,id5,Sophia,18,5,par3, id6,par3,id6,Emma,20,6,par3]"); - EXPECTED1.put("par4", "[id7,par4,id7,Bob,44,7,par4, id8,par4,id8,Han,56,8,par4]"); - - EXPECTED2.put("par1", "[id1,par1,id1,Danny,24,1,par1, id2,par1,id2,Stephen,34,2,par1]"); - EXPECTED2.put("par2", "[id3,par2,id3,Julian,54,3,par2, id4,par2,id4,Fabian,32,4,par2]"); - EXPECTED2.put("par3", "[id5,par3,id5,Sophia,18,5,par3, id6,par3,id6,Emma,20,6,par3, " - + "id9,par3,id9,Jane,19,6,par3]"); - EXPECTED2.put("par4", "[id10,par4,id10,Ella,38,7,par4, id11,par4,id11,Phoebe,52,8,par4, " - + "id7,par4,id7,Bob,44,7,par4, id8,par4,id8,Han,56,8,par4]"); - - EXPECTED3.put("par1", "[id1,par1,id1,Danny,23,1,par1]"); - } +public class TestWriteCopyOnWrite extends TestWriteBase { protected Configuration conf; - protected StreamWriteFunctionWrapper funcWrapper; - @TempDir File tempFile; @BeforeEach - public void before() throws Exception { - final String basePath = tempFile.getAbsolutePath(); - conf = TestConfigurations.getDefaultConf(basePath); + public void before() { + conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); conf.setString(FlinkOptions.TABLE_TYPE, getTableType().name()); setUp(conf); - this.funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf); } /** @@ -113,558 +66,259 @@ protected void setUp(Configuration conf) { // for sub-class extension } - @AfterEach - public void after() throws Exception { - funcWrapper.close(); - } - @Test public void testCheckpoint() throws Exception { - // open the function and ingest data - funcWrapper.openFunction(); - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - - // no checkpoint, so the coordinator does 
not accept any events - assertTrue( - funcWrapper.getEventBuffer().length == 1 - && funcWrapper.getEventBuffer()[0] == null, "The coordinator events buffer expect to be empty"); - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - - String instant = funcWrapper.getWriteClient().getLastPendingInstant(getTableType()); - - final OperatorEvent nextEvent = funcWrapper.getNextEvent(); - MatcherAssert.assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - List writeStatuses = ((WriteMetadataEvent) nextEvent).getWriteStatuses(); - assertNotNull(writeStatuses); - MatcherAssert.assertThat(writeStatuses.size(), is(4)); // write 4 partition files - assertThat(writeStatuses.stream() - .map(WriteStatus::getPartitionPath).sorted(Comparator.naturalOrder()) - .collect(Collectors.joining(",")), - is("par1,par2,par3,par4")); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.REQUESTED, instant); - funcWrapper.checkpointComplete(1); - // the coordinator checkpoint commits the inflight instant. - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - - // checkpoint for next round, no data input, so after the checkpoint, - // there should not be REQUESTED Instant - // this triggers the data write and event send - funcWrapper.checkpointFunction(2); - - String instant2 = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - assertNotEquals(instant, instant2); - - final OperatorEvent nextEvent2 = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent2, instanceOf(WriteMetadataEvent.class)); - List writeStatuses2 = ((WriteMetadataEvent) nextEvent2).getWriteStatuses(); - assertNotNull(writeStatuses2); - assertThat(writeStatuses2.size(), is(0)); // write empty statuses - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent2); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - funcWrapper.checkpointComplete(2); - // started a new instant already - checkInflightInstant(funcWrapper.getWriteClient()); - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + // no checkpoint, so the coordinator does not accept any events + .emptyEventBuffer() + .checkpoint(1) + .assertNextEvent(4, "par1,par2,par3,par4") + .checkpointComplete(1) + // checkpoint for next round, no data input, so after the checkpoint, + // there should not be REQUESTED Instant + // this triggers the data write and event send + .checkpoint(2) + .assertEmptyEvent() + .emptyCheckpoint(2) + .end(); } @Test public void testCheckpointFails() throws Exception { - // open the function and ingest data - funcWrapper.openFunction(); - // no data written and triggers checkpoint fails, - // then we should revert the start instant - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - assertNotNull(instant); - - final OperatorEvent nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - List writeStatuses = ((WriteMetadataEvent) nextEvent).getWriteStatuses(); - 
assertNotNull(writeStatuses); - assertThat(writeStatuses.size(), is(0)); // no data write - - // fails the checkpoint - funcWrapper.checkpointFails(1); - assertFalse(funcWrapper.getCoordinatorContext().isJobFailed(), - "The last checkpoint was aborted, ignore the events"); - - // the instant metadata should be reused - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.REQUESTED, instant); - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, null); - - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - - // this returns early because there is no inflight instant - assertThrows(HoodieException.class, - () -> funcWrapper.checkpointFunction(2), - "Timeout(0ms) while waiting for"); - // do not sent the write event and fails the checkpoint, - // behaves like the last checkpoint is successful. - funcWrapper.checkpointFails(2); + // reset the config option + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, 1L); + preparePipeline(conf) + // no data written and triggers checkpoint fails, + // then we should revert the start instant + .checkpoint(1) + .assertEmptyEvent() + .checkpointFails(1) + .consume(TestData.DATA_SET_INSERT) + .checkpointThrows(2, + "Timeout(1000ms) while waiting for instant initialize") + // do not send the write event and fails the checkpoint, + // behaves like the last checkpoint is successful. + .checkpointFails(2) + .end(); } @Test public void testSubtaskFails() throws Exception { // open the function and ingest data - funcWrapper.openFunction(); - // no data written and triggers checkpoint fails, - // then we should revert the start instant - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - funcWrapper.getNextEvent(); - - String instant1 = funcWrapper.getWriteClient().getLastPendingInstant(getTableType()); - assertNotNull(instant1); - - // fails the subtask - funcWrapper.subTaskFails(0); - - String instant2 = funcWrapper.getWriteClient().getLastPendingInstant(getTableType()); - assertNotEquals(instant2, instant1, "The previous instant should be rolled back when starting new instant"); - - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, null); + preparePipeline() + .checkpoint(1) + .assertEmptyEvent() + .subTaskFails(0) + .noCompleteInstant() + .end(); } @Test public void testInsert() throws Exception { // open the function and ingest data - funcWrapper.openFunction(); - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - - assertEmptyDataFiles(); - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - - final OperatorEvent nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.REQUESTED, instant); - funcWrapper.checkpointComplete(1); - checkWrittenData(tempFile, EXPECTED1); - // the coordinator checkpoint commits the inflight instant. 
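The REQUESTED-to-COMPLETED progression the removed assertions walk through here can also be observed through the meta-client based helpers added to StreamerUtil in this change; a hedged sketch, with metaClient assumed to come from StreamerUtil.createMetaClient:

    // Latest inflight instant on the commits timeline; the helper reloads the active
    // timeline first by default.
    String pending = StreamerUtil.getLastPendingInstant(metaClient);
    // Latest completed instant, or null when nothing has been committed yet.
    String completed = StreamerUtil.getLastCompletedInstant(metaClient);
    boolean anyCommits = StreamerUtil.haveSuccessfulCommits(metaClient);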
- checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - checkWrittenData(tempFile, EXPECTED1); + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED1) + .end(); } @Test public void testInsertDuplicates() throws Exception { // reset the config option - conf.setBoolean(FlinkOptions.INSERT_DROP_DUPS, true); - funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf); - - // open the function and ingest data - funcWrapper.openFunction(); - for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) { - funcWrapper.invoke(rowData); - } - - assertEmptyDataFiles(); - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - - OperatorEvent nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - funcWrapper.checkpointComplete(1); - - checkWrittenData(tempFile, EXPECTED3, 1); - - // insert duplicates again - for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) { - funcWrapper.invoke(rowData); - } - - funcWrapper.checkpointFunction(2); - - nextEvent = funcWrapper.getNextEvent(); - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - funcWrapper.checkpointComplete(2); - - checkWrittenData(tempFile, EXPECTED3, 1); + conf.setBoolean(FlinkOptions.PRE_COMBINE, true); + preparePipeline(conf) + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED3, 1) + // insert duplicates again + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenData(EXPECTED3, 1) + .end(); } @Test public void testUpsert() throws Exception { // open the function and ingest data - funcWrapper.openFunction(); - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - - assertEmptyDataFiles(); - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - - OperatorEvent nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - funcWrapper.checkpointComplete(1); - - // upsert another data buffer - for (RowData rowData : TestData.DATA_SET_UPDATE_INSERT) { - funcWrapper.invoke(rowData); - } - // the data is not flushed yet - checkWrittenData(tempFile, EXPECTED1); - // this triggers the data write and event send - funcWrapper.checkpointFunction(2); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - - nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.REQUESTED, instant); - funcWrapper.checkpointComplete(2); - // the coordinator 
checkpoint commits the inflight instant. - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - checkWrittenData(tempFile, EXPECTED2); + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + // upsert another data buffer + .consume(TestData.DATA_SET_UPDATE_INSERT) + // the data is not flushed yet + .checkWrittenData(EXPECTED1) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenData(EXPECTED2) + .end(); } @Test public void testUpsertWithDelete() throws Exception { // open the function and ingest data - funcWrapper.openFunction(); - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - - assertEmptyDataFiles(); - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - - OperatorEvent nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - funcWrapper.checkpointComplete(1); - - // upsert another data buffer - for (RowData rowData : TestData.DATA_SET_UPDATE_DELETE) { - funcWrapper.invoke(rowData); - } - // the data is not flushed yet - checkWrittenData(tempFile, EXPECTED1); - // this triggers the data write and event send - funcWrapper.checkpointFunction(2); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - - nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.REQUESTED, instant); - funcWrapper.checkpointComplete(2); - // the coordinator checkpoint commits the inflight instant. - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - - Map expected = getUpsertWithDeleteExpected(); - checkWrittenData(tempFile, expected); + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .consume(TestData.DATA_SET_UPDATE_DELETE) + .checkWrittenData(EXPECTED1) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenData(getUpsertWithDeleteExpected()) + .end(); } @Test public void testInsertWithMiniBatches() throws Exception { // reset the config option conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.0008); // 839 bytes batch size - funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf); - - // open the function and ingest data - funcWrapper.openFunction(); - // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. 
- // so 3 records expect to trigger a mini-batch write - for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) { - funcWrapper.invoke(rowData); - } - - Map> dataBuffer = funcWrapper.getDataBuffer(); - assertThat("Should have 1 data bucket", dataBuffer.size(), is(1)); - assertThat("3 records expect to flush out as a mini-batch", - dataBuffer.values().stream().findFirst().map(List::size).orElse(-1), - is(3)); - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - dataBuffer = funcWrapper.getDataBuffer(); - assertThat("All data should be flushed out", dataBuffer.size(), is(0)); - - final OperatorEvent event1 = funcWrapper.getNextEvent(); // remove the first event first - final OperatorEvent event2 = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", event2, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, event1); - funcWrapper.getCoordinator().handleEventFromOperator(0, event2); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - - funcWrapper.checkpointComplete(1); Map expected = getMiniBatchExpected(); - checkWrittenData(tempFile, expected, 1); - - // started a new instant already - checkInflightInstant(funcWrapper.getWriteClient()); - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - // insert duplicates again - for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) { - funcWrapper.invoke(rowData); - } - - funcWrapper.checkpointFunction(2); - - final OperatorEvent event3 = funcWrapper.getNextEvent(); // remove the first event first - final OperatorEvent event4 = funcWrapper.getNextEvent(); - funcWrapper.getCoordinator().handleEventFromOperator(0, event3); - funcWrapper.getCoordinator().handleEventFromOperator(0, event4); - funcWrapper.checkpointComplete(2); - - // Same the original base file content. - checkWrittenData(tempFile, expected, 1); + preparePipeline(conf) + // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. + // so 3 records expect to trigger a mini-batch write + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .assertDataBuffer(1, 2) + .checkpoint(1) + .allDataFlushed() + .handleEvents(2) + .checkpointComplete(1) + .checkWrittenData(expected, 1) + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .checkpoint(2) + .handleEvents(2) + .checkpointComplete(2) + .checkWrittenData(expected, 1) + .end(); } @Test public void testInsertWithDeduplication() throws Exception { // reset the config option conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.0008); // 839 bytes batch size - conf.setBoolean(FlinkOptions.INSERT_DROP_DUPS, true); - funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf); - - // open the function and ingest data - funcWrapper.openFunction(); - // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. 
- // so 3 records expect to trigger a mini-batch write - for (RowData rowData : TestData.DATA_SET_INSERT_SAME_KEY) { - funcWrapper.invoke(rowData); - } - - Map> dataBuffer = funcWrapper.getDataBuffer(); - assertThat("Should have 1 data bucket", dataBuffer.size(), is(1)); - assertThat("3 records expect to flush out as a mini-batch", - dataBuffer.values().stream().findFirst().map(List::size).orElse(-1), - is(3)); - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - dataBuffer = funcWrapper.getDataBuffer(); - assertThat("All data should be flushed out", dataBuffer.size(), is(0)); - - final OperatorEvent event1 = funcWrapper.getNextEvent(); // remove the first event first - final OperatorEvent event2 = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", event2, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, event1); - funcWrapper.getCoordinator().handleEventFromOperator(0, event2); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - - funcWrapper.checkpointComplete(1); + conf.setBoolean(FlinkOptions.PRE_COMBINE, true); Map expected = new HashMap<>(); expected.put("par1", "[id1,par1,id1,Danny,23,4,par1]"); - checkWrittenData(tempFile, expected, 1); - - // started a new instant already - checkInflightInstant(funcWrapper.getWriteClient()); - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - - // insert duplicates again - for (RowData rowData : TestData.DATA_SET_INSERT_SAME_KEY) { - funcWrapper.invoke(rowData); - } - - funcWrapper.checkpointFunction(2); - - final OperatorEvent event3 = funcWrapper.getNextEvent(); // remove the first event first - final OperatorEvent event4 = funcWrapper.getNextEvent(); - funcWrapper.getCoordinator().handleEventFromOperator(0, event3); - funcWrapper.getCoordinator().handleEventFromOperator(0, event4); - funcWrapper.checkpointComplete(2); - - // Same the original base file content. - checkWrittenData(tempFile, expected, 1); + preparePipeline(conf) + // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. + // so 3 records expect to trigger a mini-batch write + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .assertDataBuffer(1, 2) + .checkpoint(1) + .allDataFlushed() + .handleEvents(2) + .checkpointComplete(1) + .checkWrittenData(expected, 1) + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(2) + .handleEvents(2) + .checkpointComplete(2) + .checkWrittenData(expected, 1) + .end(); } @Test - public void testInsertAllowsDuplication() throws Exception { - InsertFunctionWrapper funcWrapper = new InsertFunctionWrapper<>(tempFile.getAbsolutePath(), conf); - - // open the function and ingest data - funcWrapper.openFunction(); - // Each record is 208 bytes. 
so 4 records expect to trigger a mini-batch write - for (RowData rowData : TestData.DATA_SET_INSERT_SAME_KEY) { - funcWrapper.invoke(rowData); - } - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - assertNull(funcWrapper.getWriterHelper()); - - final OperatorEvent event1 = funcWrapper.getNextEvent(); // remove the first event first - assertThat("The operator expect to send an event", event1, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, event1); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - - funcWrapper.checkpointComplete(1); - - Map expected = new HashMap<>(); - - expected.put("par1", "[" - + "id1,par1,id1,Danny,23,0,par1, " - + "id1,par1,id1,Danny,23,1,par1, " - + "id1,par1,id1,Danny,23,2,par1, " - + "id1,par1,id1,Danny,23,3,par1, " - + "id1,par1,id1,Danny,23,4,par1]"); - - TestData.checkWrittenAllData(tempFile, expected, 1); - - // started a new instant already - checkInflightInstant(funcWrapper.getWriteClient()); - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - - // insert duplicates again - for (RowData rowData : TestData.DATA_SET_INSERT_SAME_KEY) { - funcWrapper.invoke(rowData); - } - - funcWrapper.checkpointFunction(2); + public void testInsertAppendMode() throws Exception { + prepareInsertPipeline() + // Each record is 208 bytes. so 4 records expect to trigger a mini-batch write + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenAllData(EXPECTED4, 1) + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenFullData(EXPECTED5) + .end(); + } - final OperatorEvent event2 = funcWrapper.getNextEvent(); // remove the first event first - funcWrapper.getCoordinator().handleEventFromOperator(0, event2); - funcWrapper.checkpointComplete(2); + /** + * The test is almost same with {@link #testInsertWithSmallBufferSize} except that + * it is with insert clustering mode. + */ + @Test + public void testInsertClustering() throws Exception { + // reset the config option + conf.setString(FlinkOptions.OPERATION, "insert"); + conf.setBoolean(FlinkOptions.INSERT_CLUSTER, true); + conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.0008); // 839 bytes buffer size - // same with the original base file content. - expected.put("par1", "[" - + "id1,par1,id1,Danny,23,0,par1, " - + "id1,par1,id1,Danny,23,0,par1, " - + "id1,par1,id1,Danny,23,1,par1, " - + "id1,par1,id1,Danny,23,1,par1, " - + "id1,par1,id1,Danny,23,2,par1, " - + "id1,par1,id1,Danny,23,2,par1, " - + "id1,par1,id1,Danny,23,3,par1, " - + "id1,par1,id1,Danny,23,3,par1, " - + "id1,par1,id1,Danny,23,4,par1, " - + "id1,par1,id1,Danny,23,4,par1]"); - TestData.checkWrittenAllData(tempFile, expected, 1); + TestWriteMergeOnRead.TestHarness.instance() + // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. + // so 3 records expect to trigger a mini-batch write + // flush the max size bucket once at a time. 
+ .preparePipeline(tempFile, conf) + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .assertDataBuffer(1, 2) + .checkpoint(1) + .allDataFlushed() + .handleEvents(2) + .checkpointComplete(1) + .checkWrittenData(EXPECTED4, 1) + // insert duplicates again + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(2) + .handleEvents(2) + .checkpointComplete(2) + .checkWrittenFullData(EXPECTED5) + .end(); } @Test public void testInsertWithSmallBufferSize() throws Exception { // reset the config option conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.0008); // 839 bytes buffer size - funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf); - - // open the function and ingest data - funcWrapper.openFunction(); - // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. - // so 3 records expect to trigger a mini-batch write - // flush the max size bucket once at a time. - for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) { - funcWrapper.invoke(rowData); - } - - Map> dataBuffer = funcWrapper.getDataBuffer(); - assertThat("Should have 1 data bucket", dataBuffer.size(), is(1)); - assertThat("3 records expect to flush out as a mini-batch", - dataBuffer.values().stream().findFirst().map(List::size).orElse(-1), - is(3)); - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - dataBuffer = funcWrapper.getDataBuffer(); - assertThat("All data should be flushed out", dataBuffer.size(), is(0)); - - for (int i = 0; i < 2; i++) { - final OperatorEvent event = funcWrapper.getNextEvent(); // remove the first event first - assertThat("The operator expect to send an event", event, instanceOf(WriteMetadataEvent.class)); - funcWrapper.getCoordinator().handleEventFromOperator(0, event); - } - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - - funcWrapper.checkpointComplete(1); Map expected = getMiniBatchExpected(); - checkWrittenData(tempFile, expected, 1); - - // started a new instant already - checkInflightInstant(funcWrapper.getWriteClient()); - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - - // insert duplicates again - for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) { - funcWrapper.invoke(rowData); - } - - funcWrapper.checkpointFunction(2); - for (int i = 0; i < 2; i++) { - final OperatorEvent event = funcWrapper.getNextEvent(); // remove the first event first - funcWrapper.getCoordinator().handleEventFromOperator(0, event); - } - - funcWrapper.checkpointComplete(2); - - // Same the original base file content. - checkWrittenData(tempFile, expected, 1); + preparePipeline(conf) + // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. + // so 3 records expect to trigger a mini-batch write + // flush the max size bucket once at a time. + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .assertDataBuffer(1, 2) + .checkpoint(1) + .allDataFlushed() + .handleEvents(2) + .checkpointComplete(1) + .checkWrittenData(expected, 1) + // insert duplicates again + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .checkpoint(2) + .handleEvents(2) + .checkpointComplete(2) + // Same the original base file content. 
+ .checkWrittenData(expected, 1) + .end(); } protected Map getMiniBatchExpected() { Map expected = new HashMap<>(); // the last 2 lines are merged expected.put("par1", "[" + + "id1,par1,id1,Danny,23,1,par1, " + "id1,par1,id1,Danny,23,1,par1, " + "id1,par1,id1,Danny,23,1,par1" + "]"); return expected; @@ -687,131 +341,69 @@ protected Map getExpectedBeforeCheckpointComplete() { @Test public void testIndexStateBootstrap() throws Exception { // open the function and ingest data - funcWrapper.openFunction(); - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - - assertEmptyDataFiles(); - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - - OperatorEvent nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - funcWrapper.checkpointComplete(1); - - // the data is not flushed yet - checkWrittenData(tempFile, EXPECTED1); + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED1, 4) + .end(); // reset the config option conf.setBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED, true); - funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf); - - // upsert another data buffer - funcWrapper.openFunction(); - for (RowData rowData : TestData.DATA_SET_UPDATE_INSERT) { - funcWrapper.invoke(rowData); - } - - checkIndexLoaded( - new HoodieKey("id1", "par1"), - new HoodieKey("id2", "par1"), - new HoodieKey("id3", "par2"), - new HoodieKey("id4", "par2"), - new HoodieKey("id5", "par3"), - new HoodieKey("id6", "par3"), - new HoodieKey("id7", "par4"), - new HoodieKey("id8", "par4"), - new HoodieKey("id9", "par3"), - new HoodieKey("id10", "par4"), - new HoodieKey("id11", "par4")); - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - - assertTrue(funcWrapper.isAlreadyBootstrap()); - - String instant = funcWrapper.getWriteClient() - .getLastPendingInstant(getTableType()); - - nextEvent = funcWrapper.getNextEvent(); - assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); - - Map expected = getExpectedBeforeCheckpointComplete(); - checkWrittenData(tempFile, expected); - - funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); - assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event"); - - checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.REQUESTED, instant); - - funcWrapper.checkpointComplete(1); - // the coordinator checkpoint commits the inflight instant. 
- checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant); - checkWrittenData(tempFile, EXPECTED2); + preparePipeline(conf) + .consume(TestData.DATA_SET_UPDATE_INSERT) + .checkIndexLoaded( + new HoodieKey("id1", "par1"), + new HoodieKey("id2", "par1"), + new HoodieKey("id3", "par2"), + new HoodieKey("id4", "par2"), + new HoodieKey("id5", "par3"), + new HoodieKey("id6", "par3"), + new HoodieKey("id7", "par4"), + new HoodieKey("id8", "par4"), + new HoodieKey("id9", "par3"), + new HoodieKey("id10", "par4"), + new HoodieKey("id11", "par4")) + .checkpoint(1) + .assertBootstrapped() + .assertNextEvent() + .checkWrittenData(getExpectedBeforeCheckpointComplete()) + .checkpointComplete(1) + .checkWrittenData(EXPECTED2) + .end(); } @Test public void testWriteExactlyOnce() throws Exception { // reset the config option - conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, 3); + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, 1L); conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.0006); // 630 bytes buffer size - funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf); - - // open the function and ingest data - - funcWrapper.openFunction(); - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - - // no checkpoint, so the coordinator does not accept any events - assertTrue( - funcWrapper.getEventBuffer().length == 1 - && funcWrapper.getEventBuffer()[0] == null, "The coordinator events buffer expect to be empty"); - - // this triggers the data write and event send - funcWrapper.checkpointFunction(1); - assertTrue(funcWrapper.isConforming(), "The write function should be waiting for the instant to commit"); - - for (int i = 0; i < 2; i++) { - final OperatorEvent event = funcWrapper.getNextEvent(); // remove the first event first - assertThat("The operator expect to send an event", event, instanceOf(WriteMetadataEvent.class)); - funcWrapper.getCoordinator().handleEventFromOperator(0, event); - } - - funcWrapper.checkpointComplete(1); - - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - - assertFalse(funcWrapper.isConforming(), "The write function should finish waiting for the instant to commit"); - - // checkpoint for the next round, when there is eager flush but the write - // task is waiting for the instant commit ack, should throw for timeout. 
- funcWrapper.checkpointFunction(2); - - assertThrows(HoodieException.class, () -> { - for (RowData rowData : TestData.DATA_SET_INSERT) { - funcWrapper.invoke(rowData); - } - }, "Timeout(500ms) while waiting for instant"); + preparePipeline(conf) + .consume(TestData.DATA_SET_INSERT) + .emptyEventBuffer() + .checkpoint(1) + .assertConfirming() + .handleEvents(4) + .checkpointComplete(1) + .consume(TestData.DATA_SET_INSERT) + .assertNotConfirming() + .checkpoint(2) + .assertConsumeThrows(TestData.DATA_SET_INSERT, + "Timeout(1000ms) while waiting for instant initialize") + .end(); } @Test - public void testReuseEmbeddedServer() { - HoodieFlinkWriteClient writeClient = StreamerUtil.createWriteClient(conf, null); + public void testReuseEmbeddedServer() throws IOException { + HoodieFlinkWriteClient writeClient = StreamerUtil.createWriteClient(conf); FileSystemViewStorageConfig viewStorageConfig = writeClient.getConfig().getViewStorageConfig(); assertSame(viewStorageConfig.getStorageType(), FileSystemViewStorageType.REMOTE_FIRST); // get another write client - writeClient = StreamerUtil.createWriteClient(conf, null); + writeClient = StreamerUtil.createWriteClient(conf); assertSame(writeClient.getConfig().getViewStorageConfig().getStorageType(), FileSystemViewStorageType.REMOTE_FIRST); assertEquals(viewStorageConfig.getRemoteViewServerPort(), writeClient.getConfig().getViewStorageConfig().getRemoteViewServerPort()); } @@ -820,56 +412,19 @@ public void testReuseEmbeddedServer() { // Utilities // ------------------------------------------------------------------------- - @SuppressWarnings("rawtypes") - private void checkInflightInstant(HoodieFlinkWriteClient writeClient) { - final String instant = writeClient.getLastPendingInstant(getTableType()); - assertNotNull(instant); - } - - @SuppressWarnings("rawtypes") - private void checkInstantState( - HoodieFlinkWriteClient writeClient, - HoodieInstant.State state, - String instantStr) { - final String instant; - switch (state) { - case REQUESTED: - instant = writeClient.getLastPendingInstant(getTableType()); - break; - case COMPLETED: - instant = writeClient.getLastCompletedInstant(getTableType()); - break; - default: - throw new AssertionError("Unexpected state"); - } - assertThat(instant, is(instantStr)); + private TestHarness preparePipeline() throws Exception { + return TestHarness.instance().preparePipeline(tempFile, conf); } - protected HoodieTableType getTableType() { - return HoodieTableType.COPY_ON_WRITE; + private TestHarness preparePipeline(Configuration conf) throws Exception { + return TestHarness.instance().preparePipeline(tempFile, conf); } - protected void checkWrittenData(File baseFile, Map expected) throws Exception { - checkWrittenData(baseFile, expected, 4); + protected TestHarness prepareInsertPipeline() throws Exception { + return TestHarness.instance().preparePipeline(tempFile, conf, true); } - protected void checkWrittenData(File baseFile, Map expected, int partitions) throws Exception { - TestData.checkWrittenData(baseFile, expected, partitions); - } - - /** - * Asserts the data files are empty. - */ - protected void assertEmptyDataFiles() { - File[] dataFiles = tempFile.listFiles(file -> !file.getName().startsWith(".")); - assertNotNull(dataFiles); - assertThat(dataFiles.length, is(0)); - } - - private void checkIndexLoaded(HoodieKey... 
keys) { - for (HoodieKey key : keys) { - assertTrue(funcWrapper.isKeyInState(key), - "Key: " + key + " assumes to be in the index state"); - } + protected HoodieTableType getTableType() { + return HoodieTableType.COPY_ON_WRITE; } } diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnRead.java b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnRead.java index 07e23b56edc92..a35a0ac8d0b88 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnRead.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnRead.java @@ -18,49 +18,18 @@ package org.apache.hudi.sink; -import org.apache.hudi.client.FlinkTaskContextSupplier; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.config.SerializableConfiguration; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.TableSchemaResolver; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.configuration.FlinkOptions; -import org.apache.hudi.table.HoodieFlinkTable; -import org.apache.hudi.util.StreamerUtil; -import org.apache.hudi.utils.TestData; -import org.apache.avro.Schema; import org.apache.flink.configuration.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.junit.jupiter.api.BeforeEach; -import java.io.File; -import java.util.Comparator; import java.util.HashMap; import java.util.Map; -import java.util.stream.Collectors; /** * Test cases for delta stream write. */ public class TestWriteMergeOnRead extends TestWriteCopyOnWrite { - private FileSystem fs; - private HoodieWriteConfig writeConfig; - private HoodieFlinkEngineContext context; - - @BeforeEach - public void before() throws Exception { - super.before(); - fs = FSUtils.getFs(tempFile.getAbsolutePath(), new org.apache.hadoop.conf.Configuration()); - writeConfig = StreamerUtil.getHoodieClientConfig(conf); - context = new HoodieFlinkEngineContext( - new SerializableConfiguration(StreamerUtil.getHadoopConf()), - new FlinkTaskContextSupplier(null)); - } @Override protected void setUp(Configuration conf) { @@ -68,17 +37,8 @@ protected void setUp(Configuration conf) { } @Override - protected void checkWrittenData(File baseFile, Map expected, int partitions) throws Exception { - HoodieTableMetaClient metaClient = HoodieFlinkTable.create(writeConfig, context).getMetaClient(); - Schema schema = new TableSchemaResolver(metaClient).getTableAvroSchema(); - String latestInstant = metaClient.getCommitsTimeline().filterCompletedInstants() - .getInstants() - .filter(x -> x.getAction().equals(HoodieActiveTimeline.DELTA_COMMIT_ACTION)) - .map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()).stream() - .max(Comparator.naturalOrder()) - .orElse(null); - TestData.checkWrittenDataMOR(fs, latestInstant, baseFile, expected, partitions, schema); + public void testInsertClustering() { + // insert clustering is only valid for cow table. 
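+    // so this merge-on-read suite overrides the copy-on-write base case with a no-op.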
} @Override diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnReadWithCompact.java b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnReadWithCompact.java index acce120f484f0..704d94caba395 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnReadWithCompact.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnReadWithCompact.java @@ -37,6 +37,11 @@ protected void setUp(Configuration conf) { conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); } + @Override + public void testInsertClustering() { + // insert clustering is only valid for cow table. + } + @Override protected Map getExpectedBeforeCheckpointComplete() { return EXPECTED1; diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/compact/ITTestHoodieFlinkCompactor.java b/hudi-flink/src/test/java/org/apache/hudi/sink/compact/ITTestHoodieFlinkCompactor.java new file mode 100644 index 0000000000000..52002b1180bcc --- /dev/null +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/compact/ITTestHoodieFlinkCompactor.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestSQL; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.operators.ProcessOperator; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.ExecutionConfigOptions; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * IT cases for {@link org.apache.hudi.common.model.HoodieRecord}. + */ +public class ITTestHoodieFlinkCompactor { + private static final Map> EXPECTED = new HashMap<>(); + + static { + EXPECTED.put("par1", Arrays.asList("id1,par1,id1,Danny,23,1000,par1", "id2,par1,id2,Stephen,33,2000,par1")); + EXPECTED.put("par2", Arrays.asList("id3,par2,id3,Julian,53,3000,par2", "id4,par2,id4,Fabian,31,4000,par2")); + EXPECTED.put("par3", Arrays.asList("id5,par3,id5,Sophia,18,5000,par3", "id6,par3,id6,Emma,20,6000,par3")); + EXPECTED.put("par4", Arrays.asList("id7,par4,id7,Bob,44,7000,par4", "id8,par4,id8,Han,56,8000,par4")); + } + + @TempDir + File tempFile; + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testHoodieFlinkCompactor(boolean enableChangelog) throws Exception { + // Create hoodie table and insert into data. + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); + tableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1); + Map options = new HashMap<>(); + options.put(FlinkOptions.COMPACTION_ASYNC_ENABLED.key(), "false"); + options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); + options.put(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), enableChangelog + ""); + String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); + tableEnv.executeSql(hoodieTableDDL); + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + + // wait for the asynchronous commit to finish + TimeUnit.SECONDS.sleep(3); + + // Make configuration and setAvroSchema. 
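+    // The remainder of the test roughly mirrors the standalone compaction job:
+    //   1. build a Flink Configuration from FlinkCompactionConfig and copy the table name,
+    //      Avro schema and changelog mode from the meta client into it,
+    //   2. schedule a compaction plan at a fresh instant through the write client,
+    //   3. transition the requested compaction instant to inflight,
+    //   4. run a small pipeline: plan source -> CompactFunction -> CompactionCommitSink,
+    //   5. verify the compacted files with TestData.checkWrittenFullData.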
+ StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + FlinkCompactionConfig cfg = new FlinkCompactionConfig(); + cfg.path = tempFile.getAbsolutePath(); + Configuration conf = FlinkCompactionConfig.toFlinkConfig(cfg); + conf.setString(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + + // create metaClient + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(conf); + + // set the table name + conf.setString(FlinkOptions.TABLE_NAME, metaClient.getTableConfig().getTableName()); + + // set table schema + CompactionUtil.setAvroSchema(conf, metaClient); + + // infer changelog mode + CompactionUtil.inferChangelogMode(conf, metaClient); + + // judge whether have operation + // To compute the compaction instant time and do compaction. + String compactionInstantTime = CompactionUtil.getCompactionInstantTime(metaClient); + HoodieFlinkWriteClient writeClient = StreamerUtil.createWriteClient(conf); + boolean scheduled = writeClient.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); + + assertTrue(scheduled, "The compaction plan should be scheduled"); + + HoodieFlinkTable table = writeClient.getHoodieTable(); + // generate compaction plan + // should support configurable commit metadata + HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan( + table.getMetaClient(), compactionInstantTime); + + HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); + // Mark instant as compaction inflight + table.getActiveTimeline().transitionCompactionRequestedToInflight(instant); + + env.addSource(new CompactionPlanSourceFunction(compactionPlan, compactionInstantTime)) + .name("compaction_source") + .uid("uid_compaction_source") + .rebalance() + .transform("compact_task", + TypeInformation.of(CompactionCommitEvent.class), + new ProcessOperator<>(new CompactFunction(conf))) + .setParallelism(compactionPlan.getOperations().size()) + .addSink(new CompactionCommitSink(conf)) + .name("clean_commits") + .uid("uid_clean_commits") + .setParallelism(1); + + env.execute("flink_hudi_compaction"); + writeClient.close(); + TestData.checkWrittenFullData(tempFile, EXPECTED); + } +} diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/partitioner/TestBucketAssigner.java b/hudi-flink/src/test/java/org/apache/hudi/sink/partitioner/TestBucketAssigner.java index d10421d6606af..053c2a39c8e09 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/partitioner/TestBucketAssigner.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/partitioner/TestBucketAssigner.java @@ -22,9 +22,6 @@ import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.sink.partitioner.profile.WriteProfile; @@ -51,9 +48,9 @@ import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; 
import static org.junit.jupiter.api.Assertions.assertTrue; /** @@ -358,11 +355,11 @@ public void testWriteProfileReload() throws Exception { assertTrue(smallFiles1.isEmpty(), "Should have no small files"); TestData.writeData(TestData.DATA_SET_INSERT, conf); - Option instantOption = getLastCompleteInstant(writeProfile); - assertFalse(instantOption.isPresent()); + String instantOption = getLastCompleteInstant(writeProfile); + assertNull(instantOption); writeProfile.reload(1); - String instant1 = getLastCompleteInstant(writeProfile).orElse(null); + String instant1 = getLastCompleteInstant(writeProfile); assertNotNull(instant1); List smallFiles2 = writeProfile.getSmallFiles("par1"); assertThat("Should have 1 small file", smallFiles2.size(), is(1)); @@ -376,7 +373,7 @@ public void testWriteProfileReload() throws Exception { smallFiles3.get(0).location.getInstantTime(), is(instant1)); writeProfile.reload(2); - String instant2 = getLastCompleteInstant(writeProfile).orElse(null); + String instant2 = getLastCompleteInstant(writeProfile); assertNotEquals(instant2, instant1, "Should have new complete instant"); List smallFiles4 = writeProfile.getSmallFiles("par1"); assertThat("Should have 1 small file", smallFiles4.size(), is(1)); @@ -389,12 +386,11 @@ public void testWriteProfileMetadataCache() throws Exception { WriteProfile writeProfile = new WriteProfile(writeConfig, context); assertTrue(writeProfile.getMetadataCache().isEmpty(), "Empty table should no have any instant metadata"); - HoodieTimeline emptyTimeline = writeProfile.getTable().getActiveTimeline(); - // write 3 instants of data for (int i = 0; i < 3; i++) { TestData.writeData(TestData.DATA_SET_INSERT, conf); } + // the record profile triggers the metadata loading writeProfile.reload(1); assertThat("Metadata cache should have same number entries as timeline instants", writeProfile.getMetadataCache().size(), is(3)); @@ -402,15 +398,10 @@ public void testWriteProfileMetadataCache() throws Exception { writeProfile.getSmallFiles("par1"); assertThat("The metadata should be reused", writeProfile.getMetadataCache().size(), is(3)); - - writeProfile.reload(2); - writeProfile.initFSViewIfNecessary(emptyTimeline); - assertTrue(writeProfile.getMetadataCache().isEmpty(), "Metadata cache should be all cleaned"); } - private static Option getLastCompleteInstant(WriteProfile profile) { - return profile.getTable().getMetaClient().getCommitsTimeline() - .filterCompletedInstants().lastInstant().map(HoodieInstant::getTimestamp); + private static String getLastCompleteInstant(WriteProfile profile) { + return StreamerUtil.getLastCompletedInstant(profile.getTable().getMetaClient()); } private void assertBucketEquals( diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java index ed23754d945b1..642a407c1c7d0 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java @@ -18,7 +18,6 @@ package org.apache.hudi.sink.utils; -import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.sink.StreamWriteOperatorCoordinator; import org.apache.hudi.sink.append.AppendWriteFunction; import org.apache.hudi.sink.bulk.BulkInsertWriterHelper; @@ -36,6 +35,7 @@ import org.apache.flink.runtime.operators.testutils.MockEnvironment; import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; import 
org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.streaming.api.operators.collect.utils.MockFunctionSnapshotContext; import org.apache.flink.streaming.api.operators.collect.utils.MockOperatorEventGateway; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.logical.RowType; @@ -47,7 +47,7 @@ * * @param Input type */ -public class InsertFunctionWrapper { +public class InsertFunctionWrapper implements TestFunctionWrapper { private final Configuration conf; private final RowType rowType; @@ -98,16 +98,11 @@ public OperatorEvent getNextEvent() { return this.gateway.getNextEvent(); } - @SuppressWarnings("rawtypes") - public HoodieFlinkWriteClient getWriteClient() { - return this.writeFunction.getWriteClient(); - } - public void checkpointFunction(long checkpointId) throws Exception { // checkpoint the coordinator first this.coordinator.checkpointCoordinator(checkpointId, new CompletableFuture<>()); - writeFunction.snapshotState(null); + writeFunction.snapshotState(new MockFunctionSnapshotContext(checkpointId)); stateInitializationContext.getOperatorStateStore().checkpointBegin(checkpointId); } @@ -120,6 +115,11 @@ public StreamWriteOperatorCoordinator getCoordinator() { return coordinator; } + @Override + public void close() throws Exception { + this.coordinator.close(); + } + public BulkInsertWriterHelper getWriterHelper() { return this.writeFunction.getWriterHelper(); } diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockCoordinatorExecutor.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockCoordinatorExecutor.java index 099dfd63f4b3f..7e84453aa375e 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockCoordinatorExecutor.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockCoordinatorExecutor.java @@ -18,6 +18,8 @@ package org.apache.hudi.sink.utils; +import org.apache.hudi.exception.HoodieException; + import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.function.ThrowingRunnable; @@ -25,17 +27,21 @@ import org.slf4j.LoggerFactory; /** - * A mock {@link CoordinatorExecutor} that executes the actions synchronously. + * A mock {@link NonThrownExecutor} that executes the actions synchronously. */ -public class MockCoordinatorExecutor extends CoordinatorExecutor { +public class MockCoordinatorExecutor extends NonThrownExecutor { private static final Logger LOG = LoggerFactory.getLogger(MockCoordinatorExecutor.class); public MockCoordinatorExecutor(OperatorCoordinator.Context context) { - super(context, LOG); + super(LOG, (errMsg, t) -> context.failJob(new HoodieException(errMsg, t)), true); } @Override - public void execute(ThrowingRunnable action, String actionName, Object... actionParams) { + public void execute( + ThrowingRunnable action, + ExceptionHook hook, + String actionName, + Object... 
actionParams) { final String actionString = String.format(actionName, actionParams); try { action.run(); @@ -43,9 +49,12 @@ public void execute(ThrowingRunnable action, String actionName, Objec } catch (Throwable t) { // if we have a JVM critical error, promote it immediately, there is a good // chance the - // logging or job failing will not succeed any more + // logging or job failing will not succeed anymore ExceptionUtils.rethrowIfFatalErrorOrOOM(t); - exceptionHook(actionString, t); + final String errMsg = String.format("Executor executes action [%s] error", actionString); + if (hook != null) { + hook.apply(errMsg, t); + } } } } diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java index c5d3ec5a21221..54a142a25b4c2 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java @@ -18,7 +18,6 @@ package org.apache.hudi.sink.utils; -import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.configuration.FlinkOptions; @@ -46,6 +45,7 @@ import org.apache.flink.streaming.api.graph.StreamConfig; import org.apache.flink.streaming.api.operators.Output; import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.streaming.api.operators.collect.utils.MockFunctionSnapshotContext; import org.apache.flink.streaming.api.operators.collect.utils.MockOperatorEventGateway; import org.apache.flink.streaming.runtime.streamrecord.StreamElement; import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; @@ -66,7 +66,7 @@ * * @param Input type */ -public class StreamWriteFunctionWrapper { +public class StreamWriteFunctionWrapper implements TestFunctionWrapper { private final Configuration conf; private final IOManager ioManager; @@ -142,9 +142,6 @@ public StreamWriteFunctionWrapper(String tablePath, Configuration conf) throws E public void openFunction() throws Exception { this.coordinator.start(); this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); - if (conf.getBoolean(FlinkOptions.METADATA_ENABLED)) { - this.coordinator.setMetadataSyncExecutor(new MockCoordinatorExecutor(coordinatorContext)); - } toHoodieFunction = new RowDataToHoodieFunction<>(TestConfigurations.ROW_TYPE, conf); toHoodieFunction.setRuntimeContext(runtimeContext); toHoodieFunction.open(conf); @@ -214,11 +211,6 @@ public Map> getDataBuffer() { return this.writeFunction.getDataBuffer(); } - @SuppressWarnings("rawtypes") - public HoodieFlinkWriteClient getWriteClient() { - return this.writeFunction.getWriteClient(); - } - public void checkpointFunction(long checkpointId) throws Exception { // checkpoint the coordinator first this.coordinator.checkpointCoordinator(checkpointId, new CompletableFuture<>()); @@ -227,7 +219,7 @@ public void checkpointFunction(long checkpointId) throws Exception { } bucketAssignerFunction.snapshotState(null); - writeFunction.snapshotState(null); + writeFunction.snapshotState(new MockFunctionSnapshotContext(checkpointId)); stateInitializationContext.getOperatorStateStore().checkpointBegin(checkpointId); } diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java new 
file mode 100644 index 0000000000000..d2fe8196502c3 --- /dev/null +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.sink.StreamWriteOperatorCoordinator; +import org.apache.hudi.sink.event.WriteMetadataEvent; + +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +import java.util.List; +import java.util.Map; + +/** + * Define the common interfaces for test function wrappers. + */ +public interface TestFunctionWrapper { + /** + * Open all the functions within this wrapper. + */ + void openFunction() throws Exception; + + /** + * Process the given input record {@code record}. + */ + void invoke(I record) throws Exception; + + /** + * Returns the event buffer sent by the write tasks. + */ + WriteMetadataEvent[] getEventBuffer(); + + /** + * Returns the next event. + */ + OperatorEvent getNextEvent(); + + /** + * Snapshot all the functions in the wrapper. + */ + void checkpointFunction(long checkpointId) throws Exception; + + /** + * Mark checkpoint with id {code checkpointId} as success. + */ + void checkpointComplete(long checkpointId); + + /** + * Returns the operator coordinator. + */ + StreamWriteOperatorCoordinator getCoordinator(); + + /** + * Returns the data buffer of the write task. + */ + default Map> getDataBuffer() { + throw new UnsupportedOperationException(); + } + + /** + * Mark checkpoint with id {code checkpointId} as failed. + */ + default void checkpointFails(long checkpointId) { + throw new UnsupportedOperationException(); + } + + /** + * Returns the context of the coordinator. + */ + default MockOperatorCoordinatorContext getCoordinatorContext() { + throw new UnsupportedOperationException(); + } + + /** + * Mark sub-task with id {@code taskId} as failed. + */ + default void subTaskFails(int taskId) throws Exception { + throw new UnsupportedOperationException(); + } + + /** + * Returns whether the given key {@code key} is in the state store. + */ + default boolean isKeyInState(HoodieKey key) { + throw new UnsupportedOperationException(); + } + + /** + * Returns whether the bootstrap function already bootstrapped. + */ + default boolean isAlreadyBootstrap() throws Exception { + throw new UnsupportedOperationException(); + } + + /** + * Returns whether the write task is confirming. + */ + default boolean isConforming() { + throw new UnsupportedOperationException(); + } + + /** + * Close this function wrapper. 
+ */ + void close() throws Exception; +} diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java new file mode 100644 index 0000000000000..a03f870296db7 --- /dev/null +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestUtils; + +import org.apache.avro.Schema; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.fs.FileSystem; +import org.hamcrest.MatcherAssert; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.instanceOf; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Base class for write test cases. 
+ */ +public class TestWriteBase { + protected static final Map EXPECTED1 = new HashMap<>(); + + protected static final Map EXPECTED2 = new HashMap<>(); + + protected static final Map EXPECTED3 = new HashMap<>(); + + protected static final Map EXPECTED4 = new HashMap<>(); + + protected static final Map> EXPECTED5 = new HashMap<>(); + + static { + EXPECTED1.put("par1", "[id1,par1,id1,Danny,23,1,par1, id2,par1,id2,Stephen,33,2,par1]"); + EXPECTED1.put("par2", "[id3,par2,id3,Julian,53,3,par2, id4,par2,id4,Fabian,31,4,par2]"); + EXPECTED1.put("par3", "[id5,par3,id5,Sophia,18,5,par3, id6,par3,id6,Emma,20,6,par3]"); + EXPECTED1.put("par4", "[id7,par4,id7,Bob,44,7,par4, id8,par4,id8,Han,56,8,par4]"); + + EXPECTED2.put("par1", "[id1,par1,id1,Danny,24,1,par1, id2,par1,id2,Stephen,34,2,par1]"); + EXPECTED2.put("par2", "[id3,par2,id3,Julian,54,3,par2, id4,par2,id4,Fabian,32,4,par2]"); + EXPECTED2.put("par3", "[id5,par3,id5,Sophia,18,5,par3, id6,par3,id6,Emma,20,6,par3, " + + "id9,par3,id9,Jane,19,6,par3]"); + EXPECTED2.put("par4", "[id10,par4,id10,Ella,38,7,par4, id11,par4,id11,Phoebe,52,8,par4, " + + "id7,par4,id7,Bob,44,7,par4, id8,par4,id8,Han,56,8,par4]"); + + EXPECTED3.put("par1", "[id1,par1,id1,Danny,23,1,par1]"); + + EXPECTED4.put("par1", "[" + + "id1,par1,id1,Danny,23,0,par1, " + + "id1,par1,id1,Danny,23,1,par1, " + + "id1,par1,id1,Danny,23,2,par1, " + + "id1,par1,id1,Danny,23,3,par1, " + + "id1,par1,id1,Danny,23,4,par1]"); + + EXPECTED5.put("par1", Arrays.asList( + "id1,par1,id1,Danny,23,0,par1", + "id1,par1,id1,Danny,23,0,par1", + "id1,par1,id1,Danny,23,1,par1", + "id1,par1,id1,Danny,23,1,par1", + "id1,par1,id1,Danny,23,2,par1", + "id1,par1,id1,Danny,23,2,par1", + "id1,par1,id1,Danny,23,3,par1", + "id1,par1,id1,Danny,23,3,par1", + "id1,par1,id1,Danny,23,4,par1", + "id1,par1,id1,Danny,23,4,par1")); + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * Utils to composite the test stages. + */ + public static class TestHarness { + public static TestHarness instance() { + return new TestHarness(); + } + + private File baseFile; + private String basePath; + private Configuration conf; + private TestFunctionWrapper pipeline; + + private String lastPending; + private String lastComplete; + + public TestHarness preparePipeline(File basePath, Configuration conf) throws Exception { + preparePipeline(basePath, conf, false); + return this; + } + + public TestHarness preparePipeline(File basePath, Configuration conf, boolean append) throws Exception { + this.baseFile = basePath; + this.basePath = this.baseFile.getAbsolutePath(); + this.conf = conf; + this.pipeline = append + ? new InsertFunctionWrapper<>(this.basePath, conf) + : new StreamWriteFunctionWrapper<>(this.basePath, conf); + // open the function and ingest data + this.pipeline.openFunction(); + return this; + } + + public TestHarness consume(List inputs) throws Exception { + for (RowData rowData : inputs) { + this.pipeline.invoke(rowData); + } + return this; + } + + public TestHarness assertConsumeThrows(List inputs, String message) { + assertThrows(HoodieException.class, () -> consume(inputs), message); + return this; + } + + /** + * Assert the event buffer is empty. 
+ */ + public TestHarness emptyEventBuffer() { + assertTrue( + this.pipeline.getEventBuffer().length == 1 + && this.pipeline.getEventBuffer()[0] == null, + "The coordinator events buffer expect to be empty"); + return this; + } + + /** + * Assert the next event exists and handle over it to the coordinator. + */ + public TestHarness assertNextEvent() { + final OperatorEvent nextEvent = this.pipeline.getNextEvent(); + MatcherAssert.assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); + this.pipeline.getCoordinator().handleEventFromOperator(0, nextEvent); + assertNotNull(this.pipeline.getEventBuffer()[0], "The coordinator missed the event"); + return this; + } + + /** + * Assert the next event exists and handle over it to the coordinator. + * + * @param numWriteStatus The expected write status num reported by the event + * @param partitions The written partitions reported by the event + */ + public TestHarness assertNextEvent(int numWriteStatus, String partitions) { + final OperatorEvent nextEvent = this.pipeline.getNextEvent(); + MatcherAssert.assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); + List writeStatuses = ((WriteMetadataEvent) nextEvent).getWriteStatuses(); + assertNotNull(writeStatuses); + MatcherAssert.assertThat(writeStatuses.size(), is(numWriteStatus)); + assertThat(writeStatuses.stream() + .map(WriteStatus::getPartitionPath).sorted(Comparator.naturalOrder()) + .collect(Collectors.joining(",")), + is(partitions)); + this.pipeline.getCoordinator().handleEventFromOperator(0, nextEvent); + assertNotNull(this.pipeline.getEventBuffer()[0], "The coordinator missed the event"); + return this; + } + + /** + * Assert the next event exists and handle over it to the coordinator. + * + *
<p>
Validates that the write metadata reported by the event is empty. + */ + public TestHarness assertEmptyEvent() { + final OperatorEvent nextEvent = this.pipeline.getNextEvent(); + MatcherAssert.assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); + List writeStatuses = ((WriteMetadataEvent) nextEvent).getWriteStatuses(); + assertNotNull(writeStatuses); + MatcherAssert.assertThat(writeStatuses.size(), is(0)); + this.pipeline.getCoordinator().handleEventFromOperator(0, nextEvent); + assertNotNull(this.pipeline.getEventBuffer()[0], "The coordinator missed the event"); + return this; + } + + /** + * Assert the data buffer with given number of buckets and records. + */ + public TestHarness assertDataBuffer(int numBuckets, int numRecords) { + Map> dataBuffer = this.pipeline.getDataBuffer(); + assertThat("Should have " + numBuckets + " data bucket", dataBuffer.size(), is(numBuckets)); + assertThat(numRecords + " records expect to flush out as a mini-batch", + dataBuffer.values().stream().findFirst().map(List::size).orElse(-1), + is(numRecords)); + return this; + } + + /** + * Checkpoints the pipeline, which triggers the data write and event send. + */ + public TestHarness checkpoint(long checkpointId) throws Exception { + this.pipeline.checkpointFunction(checkpointId); + return this; + } + + public TestHarness allDataFlushed() { + Map> dataBuffer = this.pipeline.getDataBuffer(); + assertThat("All data should be flushed out", dataBuffer.size(), is(0)); + return this; + } + + /** + * Handle the next {@code numEvents} events and handle over them to the coordinator. + */ + public TestHarness handleEvents(int numEvents) { + for (int i = 0; i < numEvents; i++) { + final OperatorEvent event = this.pipeline.getNextEvent(); // remove the first event first + assertThat("The operator expect to send an event", event, instanceOf(WriteMetadataEvent.class)); + this.pipeline.getCoordinator().handleEventFromOperator(0, event); + } + assertNotNull(this.pipeline.getEventBuffer()[0], "The coordinator missed the event"); + return this; + } + + /** + * Mark the checkpoint with id {@code checkpointId} as finished. + */ + public TestHarness checkpointComplete(long checkpointId) { + this.lastPending = lastPendingInstant(); + this.pipeline.checkpointComplete(checkpointId); + // started a new instant already + checkInflightInstant(); + checkInstantState(HoodieInstant.State.COMPLETED, lastPending); + this.lastComplete = lastPending; + this.lastPending = lastPendingInstant(); // refresh last pending instant + return this; + } + + /** + * Mark the checkpoint finished with empty write metadata. + */ + public TestHarness emptyCheckpoint(long checkpointId) { + String lastPending = lastPendingInstant(); + this.pipeline.checkpointComplete(checkpointId); + // last pending instant was reused + assertEquals(this.lastPending, lastPending); + checkInstantState(HoodieInstant.State.COMPLETED, lastComplete); + return this; + } + + /** + * Mark the checkpoint with id {@code checkpointId} as failed. 
+ */ + public TestHarness checkpointFails(long checkpointId) { + this.pipeline.checkpointFails(checkpointId); + assertFalse(this.pipeline.getCoordinatorContext().isJobFailed(), + "The last checkpoint was aborted, ignore the events"); + // no complete instant + checkInstantState(HoodieInstant.State.COMPLETED, null); + return this; + } + + public TestHarness checkpointThrows(long checkpointId, String message) { + // this returns early because there is no inflight instant + assertThrows(HoodieException.class, () -> checkpoint(checkpointId), message); + return this; + } + + /** + * Mark the task with id {@code taskId} as failed. + */ + public TestHarness subTaskFails(int taskId) throws Exception { + // fails the subtask + String instant1 = lastPendingInstant(); + this.pipeline.subTaskFails(taskId); + + String instant2 = lastPendingInstant(); + assertNotEquals(instant2, instant1, "The previous instant should be rolled back when starting new instant"); + return this; + } + + public TestHarness noCompleteInstant() { + // no complete instant + checkInstantState(HoodieInstant.State.COMPLETED, null); + return this; + } + + /** + * Asserts the data files are empty. + */ + public TestHarness assertEmptyDataFiles() { + File[] dataFiles = baseFile.listFiles(file -> !file.getName().startsWith(".")); + assertNotNull(dataFiles); + assertThat(dataFiles.length, is(0)); + return this; + } + + public TestHarness checkWrittenData(Map expected) throws Exception { + checkWrittenData(expected, 4); + return this; + } + + public TestHarness checkWrittenData( + Map expected, + int partitions) throws Exception { + if (OptionsResolver.isCowTable(conf) || conf.getBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED)) { + TestData.checkWrittenData(this.baseFile, expected, partitions); + } else { + checkWrittenDataMor(baseFile, expected, partitions); + } + return this; + } + + private void checkWrittenDataMor(File baseFile, Map expected, int partitions) throws Exception { + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(basePath); + Schema schema = new TableSchemaResolver(metaClient).getTableAvroSchema(); + String latestInstant = lastCompleteInstant(); + FileSystem fs = FSUtils.getFs(basePath, new org.apache.hadoop.conf.Configuration()); + TestData.checkWrittenDataMOR(fs, latestInstant, baseFile, expected, partitions, schema); + } + + public TestHarness checkWrittenFullData(Map> expected) throws IOException { + TestData.checkWrittenFullData(this.baseFile, expected); + return this; + } + + public TestHarness checkWrittenAllData(Map expected, int partitions) throws IOException { + TestData.checkWrittenAllData(baseFile, expected, partitions); + return this; + } + + public TestHarness checkIndexLoaded(HoodieKey... 
keys) { + for (HoodieKey key : keys) { + assertTrue(this.pipeline.isKeyInState(key), + "Key: " + key + " assumes to be in the index state"); + } + return this; + } + + public TestHarness assertBootstrapped() throws Exception { + assertTrue(this.pipeline.isAlreadyBootstrap()); + return this; + } + + public TestHarness assertConfirming() { + assertTrue(this.pipeline.isConforming(), + "The write function should be waiting for the instant to commit"); + return this; + } + + public TestHarness assertNotConfirming() { + assertFalse(this.pipeline.isConforming(), + "The write function should finish waiting for the instant to commit"); + return this; + } + + public void end() throws Exception { + this.pipeline.close(); + } + + private String lastPendingInstant() { + return TestUtils.getLastPendingInstant(basePath); + } + + private void checkInflightInstant() { + final String instant = TestUtils.getLastPendingInstant(basePath); + assertNotNull(instant); + } + + private void checkInstantState(HoodieInstant.State state, String instantStr) { + final String instant; + switch (state) { + case REQUESTED: + instant = lastPendingInstant(); + break; + case COMPLETED: + instant = lastCompleteInstant(); + break; + default: + throw new AssertionError("Unexpected state"); + } + assertThat(instant, is(instantStr)); + } + + protected String lastCompleteInstant() { + return OptionsResolver.isMorTable(conf) + ? TestUtils.getLastDeltaCompleteInstant(basePath) + : TestUtils.getLastCompleteInstant(basePath); + } + } +} diff --git a/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java b/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java index f229f2de8a8f2..334df5961314d 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java +++ b/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java @@ -88,4 +88,18 @@ void testFileListingUsingMetadataNonPartitionedTable() throws Exception { assertThat(fileStatuses.length, is(1)); assertTrue(fileStatuses[0].getPath().toString().endsWith(HoodieFileFormat.PARQUET.getFileExtension())); } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testFileListingEmptyTable(boolean enableMetadata) { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setBoolean(FlinkOptions.METADATA_ENABLED, enableMetadata); + FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf); + List partitionKeys = Collections.singletonList("partition"); + List> partitions = fileIndex.getPartitions(partitionKeys, "default", false); + assertThat(partitions.size(), is(0)); + + FileStatus[] fileStatuses = fileIndex.getFilesInPartitions(); + assertThat(fileStatuses.length, is(0)); + } } diff --git a/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadMonitoringFunction.java b/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadMonitoringFunction.java index d13f68319d9b4..541890f7b05f4 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadMonitoringFunction.java +++ b/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadMonitoringFunction.java @@ -89,7 +89,7 @@ public void testConsumeFromLatestCommit() throws Exception { assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()), "All the instants should have range limit"); - String latestCommit = TestUtils.getLatestCommit(tempFile.getAbsolutePath()); + String latestCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); 
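+    // each generated split is expected to carry the last completed instant as its consuming point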
assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(latestCommit)), "All the splits should be with latestCommit instant time"); @@ -143,8 +143,8 @@ public void testConsumeFromSpecifiedCommit() throws Exception { // all the splits should come from the second commit. TestData.writeData(TestData.DATA_SET_INSERT, conf); TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); - String specifiedCommit = TestUtils.getLatestCommit(tempFile.getAbsolutePath()); - conf.setString(FlinkOptions.READ_STREAMING_START_COMMIT, specifiedCommit); + String specifiedCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.READ_START_COMMIT, specifiedCommit); StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { harness.setup(); @@ -174,8 +174,8 @@ public void testConsumeFromEarliestCommit() throws Exception { // all the splits should come from the earliest commit. TestData.writeData(TestData.DATA_SET_INSERT, conf); TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); - String specifiedCommit = TestUtils.getLatestCommit(tempFile.getAbsolutePath()); - conf.setString(FlinkOptions.READ_STREAMING_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST); + String specifiedCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.READ_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST); StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { harness.setup(); diff --git a/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java b/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java index 233e6fa7eb04a..911c68511ccee 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java +++ b/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java @@ -22,7 +22,6 @@ import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.table.format.FilePathUtils; import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; import org.apache.hudi.table.format.mor.MergeOnReadTableState; @@ -45,7 +44,6 @@ import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -263,10 +261,8 @@ private OneInputStreamOperatorTestHarness create AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE).toString(), Collections.emptyList(), new String[0]); - Path[] paths = FilePathUtils.getReadPaths(new Path(basePath), conf, hadoopConf, partitionKeys); MergeOnReadInputFormat inputFormat = MergeOnReadInputFormat.builder() .config(conf) - .paths(FilePathUtils.toFlinkPaths(paths)) .tableState(hoodieTableState) .fieldTypes(rowDataType.getChildren()) .defaultPartName("default").limit(1000L) diff --git a/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java b/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java index 5be603f7838e5..8e366bba4dfc9 100644 --- 
a/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java +++ b/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java @@ -18,6 +18,7 @@ package org.apache.hudi.table; +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.configuration.FlinkOptions; @@ -79,8 +80,11 @@ void beforeEach() { streamTableEnv = TableEnvironmentImpl.create(settings); streamTableEnv.getConfig().getConfiguration() .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1); - streamTableEnv.getConfig().getConfiguration() - .setString("execution.checkpointing.interval", "2s"); + Configuration execConf = streamTableEnv.getConfig().getConfiguration(); + execConf.setString("execution.checkpointing.interval", "2s"); + // configure not to retry after failure + execConf.setString("restart-strategy", "fixed-delay"); + execConf.setString("restart-strategy.fixed-delay.attempts", "0"); settings = EnvironmentSettings.newInstance().inBatchMode().build(); batchTableEnv = TableEnvironmentImpl.create(settings); @@ -107,13 +111,13 @@ void testStreamWriteAndReadFromSpecifiedCommit(HoodieTableType tableType) throws String insertInto = "insert into t1 select * from source"; execInsertSql(streamTableEnv, insertInto); - String firstCommit = TestUtils.getFirstCommit(tempFile.getAbsolutePath()); + String firstCommit = TestUtils.getFirstCompleteInstant(tempFile.getAbsolutePath()); streamTableEnv.executeSql("drop table t1"); hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, true) .option(FlinkOptions.TABLE_TYPE, tableType) - .option(FlinkOptions.READ_STREAMING_START_COMMIT, firstCommit) + .option(FlinkOptions.READ_START_COMMIT, firstCommit) .end(); streamTableEnv.executeSql(hoodieTableDDL); List rows = execSelectSql(streamTableEnv, "select * from t1", 10); @@ -128,7 +132,7 @@ void testStreamWriteAndReadFromSpecifiedCommit(HoodieTableType tableType) throws .setBoolean("table.dynamic-table-options.enabled", true); // specify the start commit as earliest List rows3 = execSelectSql(streamTableEnv, - "select * from t1/*+options('read.streaming.start-commit'='earliest')*/", 10); + "select * from t1/*+options('read.start-commit'='earliest')*/", 10); assertRowsEquals(rows3, TestData.DATA_SET_SOURCE_INSERT); } @@ -177,7 +181,7 @@ void testStreamReadAppendData(HoodieTableType tableType) throws Exception { // execute 2 times execInsertSql(streamTableEnv, insertInto); // remember the commit - String specifiedCommit = TestUtils.getFirstCommit(tempFile.getAbsolutePath()); + String specifiedCommit = TestUtils.getFirstCompleteInstant(tempFile.getAbsolutePath()); // another update batch String insertInto2 = "insert into t1 select * from source2"; execInsertSql(streamTableEnv, insertInto2); @@ -186,7 +190,7 @@ void testStreamReadAppendData(HoodieTableType tableType) throws Exception { .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, true) .option(FlinkOptions.TABLE_TYPE, tableType) - .option(FlinkOptions.READ_STREAMING_START_COMMIT, specifiedCommit) + .option(FlinkOptions.READ_START_COMMIT, specifiedCommit) .end(); streamTableEnv.executeSql(createHoodieTable2); List rows = execSelectSql(streamTableEnv, "select * from t2", 10); @@ -260,8 +264,7 @@ void testStreamWriteWithCleaning() { Map options1 = new HashMap<>(defaultConf.toMap()); 
options1.put(FlinkOptions.TABLE_NAME.key(), "t1"); Configuration conf = Configuration.fromMap(options1); - HoodieTimeline timeline = StreamerUtil.createWriteClient(conf, null) - .getHoodieTable().getActiveTimeline(); + HoodieTimeline timeline = StreamerUtil.createMetaClient(conf).getActiveTimeline(); assertTrue(timeline.filterCompletedInstants() .getInstants().anyMatch(instant -> instant.getAction().equals("clean")), "some commits should be cleaned"); @@ -281,15 +284,14 @@ void testStreamReadWithDeletes() throws Exception { // write another commit with deletes TestData.writeData(TestData.DATA_SET_UPDATE_DELETE, conf); - String latestCommit = StreamerUtil.createWriteClient(conf, null) - .getLastCompletedInstant(HoodieTableType.MERGE_ON_READ); + String latestCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) .option(FlinkOptions.READ_AS_STREAMING, true) .option(FlinkOptions.READ_STREAMING_CHECK_INTERVAL, 2) - .option(FlinkOptions.READ_STREAMING_START_COMMIT, latestCommit) + .option(FlinkOptions.READ_START_COMMIT, latestCommit) .option(FlinkOptions.CHANGELOG_ENABLED, true) .end(); streamTableEnv.executeSql(hoodieTableDDL); @@ -343,13 +345,13 @@ void testStreamReadMorTableWithCompactionPlan() throws Exception { .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) .option(FlinkOptions.READ_AS_STREAMING, true) - .option(FlinkOptions.READ_STREAMING_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST) + .option(FlinkOptions.READ_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST) .option(FlinkOptions.READ_STREAMING_CHECK_INTERVAL, 2) // close the async compaction .option(FlinkOptions.COMPACTION_ASYNC_ENABLED, false) // generate compaction plan for each commit .option(FlinkOptions.COMPACTION_DELTA_COMMITS, 1) - .withPartition(false) + .noPartition() .end(); streamTableEnv.executeSql(hoodieTableDDL); @@ -392,6 +394,29 @@ void testWriteAndRead(ExecMode execMode, boolean hiveStylePartitioning) { + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); } + @ParameterizedTest + @MethodSource("tableTypeAndPartitioningParams") + void testWriteAndReadWithProctimeSequence(HoodieTableType tableType, boolean hiveStylePartitioning) { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .field("uuid varchar(20)") + .field("name varchar(10)") + .field("age int") + .field("tss timestamp(3)") // use a different field with default precombine field 'ts' + .field("`partition` varchar(10)") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_SAME_KEY_T1); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, "[+I[id1, Danny, 23, 1970-01-01T00:00:01, par1]]"); + } + @ParameterizedTest @EnumSource(value = HoodieTableType.class) void testBatchModeUpsertWithoutPartition(HoodieTableType tableType) { @@ -399,7 +424,7 @@ void testBatchModeUpsertWithoutPartition(HoodieTableType tableType) { String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.TABLE_NAME, tableType.name()) - .withPartition(false) + 
.noPartition() .end(); tableEnv.executeSql(hoodieTableDDL); @@ -529,12 +554,37 @@ void testInsertOverwrite(ExecMode execMode) { } @ParameterizedTest - @EnumSource(value = ExecMode.class) - void testUpsertWithMiniBatches(ExecMode execMode) { + @EnumSource(value = HoodieTableType.class) + void testStreamWriteAndReadWithMiniBatches(HoodieTableType tableType) throws Exception { + // create filesystem table named source + String createSource = TestConfigurations.getFileSourceDDL("source", 4); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.READ_START_COMMIT, "earliest") + .option(FlinkOptions.WRITE_BATCH_SIZE, 0.00001) + .noPartition() + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + // reading from the earliest commit instance. + List rows = execSelectSql(streamTableEnv, "select * from t1", 20); + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT); + } + + @ParameterizedTest + @MethodSource("executionModeAndTableTypeParams") + void testBatchUpsertWithMiniBatches(ExecMode execMode, HoodieTableType tableType) { TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv; String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.WRITE_BATCH_SIZE, "0.001") + .option(FlinkOptions.TABLE_TYPE, tableType) .end(); tableEnv.executeSql(hoodieTableDDL); @@ -556,6 +606,34 @@ void testUpsertWithMiniBatches(ExecMode execMode) { assertRowsEquals(result, "[+I[id1, Sophia, 18, 1970-01-01T00:00:05, par1]]"); } + @Test + void testUpdateWithDefaultHoodieRecordPayload() { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .field("id int") + .field("name string") + .field("price double") + .field("ts bigint") + .pkField("id") + .noPartition() + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.PAYLOAD_CLASS_NAME, DefaultHoodieRecordPayload.class.getName()) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + final String insertInto1 = "insert into t1 values\n" + + "(1,'a1',20,20)"; + execInsertSql(tableEnv, insertInto1); + + final String insertInto4 = "insert into t1 values\n" + + "(1,'a1',20,1)"; + execInsertSql(tableEnv, insertInto4); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, "[+I[1, a1, 20.0, 20]]"); + } + @ParameterizedTest @MethodSource("executionModeAndTableTypeParams") void testWriteNonPartitionedTable(ExecMode execMode, HoodieTableType tableType) { @@ -563,7 +641,7 @@ void testWriteNonPartitionedTable(ExecMode execMode, HoodieTableType tableType) String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.TABLE_TYPE, tableType) - .withPartition(false) + .noPartition() .end(); tableEnv.executeSql(hoodieTableDDL); @@ -595,7 +673,7 @@ void testWriteGlobalIndex() { String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.INDEX_GLOBAL_ENABLED, true) - .option(FlinkOptions.INSERT_DROP_DUPS, true) + .option(FlinkOptions.PRE_COMBINE, true) .end(); streamTableEnv.executeSql(hoodieTableDDL); @@ -617,7 +695,7 @@ void testWriteLocalIndex() { String 
hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.INDEX_GLOBAL_ENABLED, false) - .option(FlinkOptions.INSERT_DROP_DUPS, true) + .option(FlinkOptions.PRE_COMBINE, true) .end(); streamTableEnv.executeSql(hoodieTableDDL); @@ -699,19 +777,17 @@ void testWriteAndReadDebeziumJson(ExecMode execMode) throws Exception { + " 'format' = 'debezium-json'\n" + ")"; streamTableEnv.executeSql(sourceDDL); - String hoodieTableDDL = "" - + "CREATE TABLE hoodie_sink(\n" - + " id INT NOT NULL,\n" - + " ts BIGINT,\n" - + " name STRING," - + " weight DOUBLE," - + " PRIMARY KEY (id) NOT ENFORCED" - + ") with (\n" - + " 'connector' = 'hudi',\n" - + " 'path' = '" + tempFile.getAbsolutePath() + "',\n" - + " 'read.streaming.enabled' = '" + (execMode == ExecMode.STREAM) + "',\n" - + " 'write.insert.drop.duplicates' = 'true'" - + ")"; + String hoodieTableDDL = sql("hoodie_sink") + .field("id INT NOT NULL") + .field("ts BIGINT") + .field("name STRING") + .field("weight DOUBLE") + .pkField("id") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, execMode == ExecMode.STREAM) + .option(FlinkOptions.PRE_COMBINE, true) + .noPartition() + .end(); streamTableEnv.executeSql(hoodieTableDDL); String insertInto = "insert into hoodie_sink select id, ts, name, weight from debezium_source"; execInsertSql(streamTableEnv, insertInto); @@ -770,7 +846,7 @@ void testBulkInsertNonPartitionedTable() { String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.OPERATION, "bulk_insert") - .withPartition(false) + .noPartition() .end(); tableEnv.executeSql(hoodieTableDDL); @@ -797,9 +873,10 @@ void testBulkInsertNonPartitionedTable() { + "+I[id1, Sophia, 18, 1970-01-01T00:00:05, par5]]", 3); } - @Test - void testAppendWrite() { - TableEnvironment tableEnv = batchTableEnv; + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testAppendWrite(boolean clustering) { + TableEnvironment tableEnv = streamTableEnv; // csv source String csvSourceDDL = TestConfigurations.getCsvSourceDDL("csv_source", "test_source_5.data"); tableEnv.executeSql(csvSourceDDL); @@ -807,7 +884,7 @@ void testAppendWrite() { String hoodieTableDDL = sql("hoodie_sink") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.OPERATION, "insert") - .option(FlinkOptions.INSERT_DEDUP, false) + .option(FlinkOptions.INSERT_CLUSTER, clustering) .end(); tableEnv.executeSql(hoodieTableDDL); @@ -854,6 +931,96 @@ void testWriteAndReadWithTimestampPartitioning(ExecMode execMode) { + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); } + @Test + void testWriteReadDecimals() { + TableEnvironment tableEnv = batchTableEnv; + String createTable = sql("decimals") + .field("f0 decimal(3, 2)") + .field("f1 decimal(10, 2)") + .field("f2 decimal(20, 2)") + .field("f3 decimal(38, 18)") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.OPERATION, "bulk_insert") + .option(FlinkOptions.PRECOMBINE_FIELD, "f1") + .pkField("f0") + .noPartition() + .end(); + tableEnv.executeSql(createTable); + + String insertInto = "insert into decimals values\n" + + "(1.23, 12345678.12, 12345.12, 123456789.12345)"; + execInsertSql(tableEnv, insertInto); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from decimals").execute().collect()); + assertRowsEquals(result1, "[+I[1.23, 12345678.12, 12345.12, 123456789.123450000000000000]]"); + } + + @ParameterizedTest + 
@EnumSource(value = HoodieTableType.class) + void testIncrementalRead(HoodieTableType tableType) throws Exception { + TableEnvironment tableEnv = batchTableEnv; + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_NAME, "t1"); + conf.setString(FlinkOptions.TABLE_TYPE, tableType.name()); + + // write 3 batches of data set + TestData.writeData(TestData.dataSetInsert(1, 2), conf); + TestData.writeData(TestData.dataSetInsert(3, 4), conf); + TestData.writeData(TestData.dataSetInsert(5, 6), conf); + + String latestCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.READ_START_COMMIT, latestCommit) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, TestData.dataSetInsert(5, 6)); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testReadWithWiderSchema(HoodieTableType tableType) throws Exception { + TableEnvironment tableEnv = batchTableEnv; + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_NAME, "t1"); + conf.setString(FlinkOptions.TABLE_TYPE, tableType.name()); + + // write a batch of data set + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + String hoodieTableDDL = sql("t1") + .field("uuid varchar(20)") + .field("name varchar(10)") + .field("age int") + .field("salary double") + .field("ts timestamp(3)") + .field("`partition` varchar(10)") + .pkField("uuid") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[id1, Danny, 23, null, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 33, null, 1970-01-01T00:00:00.002, par1], " + + "+I[id3, Julian, 53, null, 1970-01-01T00:00:00.003, par2], " + + "+I[id4, Fabian, 31, null, 1970-01-01T00:00:00.004, par2], " + + "+I[id5, Sophia, 18, null, 1970-01-01T00:00:00.005, par3], " + + "+I[id6, Emma, 20, null, 1970-01-01T00:00:00.006, par3], " + + "+I[id7, Bob, 44, null, 1970-01-01T00:00:00.007, par4], " + + "+I[id8, Han, 56, null, 1970-01-01T00:00:00.008, par4]]"; + assertRowsEquals(result, expected); + } + // ------------------------------------------------------------------------- // Utilities // ------------------------------------------------------------------------- @@ -906,7 +1073,7 @@ private void execInsertSql(TableEnvironment tEnv, String insert) { try { tableResult.getJobClient().get().getJobExecutionResult().get(); } catch (InterruptedException | ExecutionException ex) { - throw new RuntimeException(ex); + // ignored } } diff --git a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java index 1572dd446950f..cbdffe360fd2b 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java +++ b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java @@ -18,7 +18,10 @@ package org.apache.hudi.table; +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; 
+import org.apache.hudi.common.model.EventTimeAvroPayload; import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieValidationException; import org.apache.hudi.hive.MultiPartKeysValueExtractor; import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.keygen.ComplexAvroKeyGenerator; @@ -32,7 +35,6 @@ import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.api.Schema; -import org.apache.flink.table.api.ValidationException; import org.apache.flink.table.catalog.CatalogTable; import org.apache.flink.table.catalog.ObjectIdentifier; import org.apache.flink.table.catalog.ResolvedCatalogTable; @@ -84,39 +86,70 @@ void beforeEach() throws IOException { @Test void testRequiredOptionsForSource() { - // miss pk and pre combine key will throw exception + // miss pk and precombine key will throw exception ResolvedSchema schema1 = SchemaBuilder.instance() .field("f0", DataTypes.INT().notNull()) .field("f1", DataTypes.VARCHAR(20)) .field("f2", DataTypes.TIMESTAMP(3)) .build(); final MockContext sourceContext1 = MockContext.getInstance(this.conf, schema1, "f2"); - assertThrows(ValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(sourceContext1)); - assertThrows(ValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sourceContext1)); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(sourceContext1)); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sourceContext1)); - // given the pk and miss the pre combine key will throw exception + // a non-exists precombine key will throw exception ResolvedSchema schema2 = SchemaBuilder.instance() .field("f0", DataTypes.INT().notNull()) .field("f1", DataTypes.VARCHAR(20)) .field("f2", DataTypes.TIMESTAMP(3)) - .primaryKey("f0") .build(); + this.conf.setString(FlinkOptions.PRECOMBINE_FIELD, "non_exist_field"); final MockContext sourceContext2 = MockContext.getInstance(this.conf, schema2, "f2"); - assertThrows(ValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(sourceContext2)); - assertThrows(ValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sourceContext2)); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(sourceContext2)); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sourceContext2)); + this.conf.setString(FlinkOptions.PRECOMBINE_FIELD, FlinkOptions.PRECOMBINE_FIELD.defaultValue()); - // given pk and pre combine key will be ok + // given the pk but miss the pre combine key will be ok ResolvedSchema schema3 = SchemaBuilder.instance() .field("f0", DataTypes.INT().notNull()) .field("f1", DataTypes.VARCHAR(20)) .field("f2", DataTypes.TIMESTAMP(3)) - .field("ts", DataTypes.TIMESTAMP(3)) .primaryKey("f0") .build(); final MockContext sourceContext3 = MockContext.getInstance(this.conf, schema3, "f2"); + HoodieTableSource tableSource = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext3); + HoodieTableSink tableSink = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sourceContext3); + // the precombine field is overwritten + assertThat(tableSource.getConf().getString(FlinkOptions.PRECOMBINE_FIELD), is(FlinkOptions.NO_PRE_COMBINE)); + 
assertThat(tableSink.getConf().getString(FlinkOptions.PRECOMBINE_FIELD), is(FlinkOptions.NO_PRE_COMBINE)); + // precombine field not specified, use the default payload clazz + assertThat(tableSource.getConf().getString(FlinkOptions.PAYLOAD_CLASS_NAME), is(FlinkOptions.PAYLOAD_CLASS_NAME.defaultValue())); + assertThat(tableSink.getConf().getString(FlinkOptions.PAYLOAD_CLASS_NAME), is(FlinkOptions.PAYLOAD_CLASS_NAME.defaultValue())); + + // given pk but miss the pre combine key with DefaultHoodieRecordPayload should throw + this.conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, DefaultHoodieRecordPayload.class.getName()); + final MockContext sourceContext4 = MockContext.getInstance(this.conf, schema3, "f2"); + + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(sourceContext4)); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sourceContext4)); + this.conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, FlinkOptions.PAYLOAD_CLASS_NAME.defaultValue()); - assertDoesNotThrow(() -> new HoodieTableFactory().createDynamicTableSource(sourceContext3)); - assertDoesNotThrow(() -> new HoodieTableFactory().createDynamicTableSink(sourceContext3)); + // given pk and pre combine key will be ok + ResolvedSchema schema4 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + final MockContext sourceContext5 = MockContext.getInstance(this.conf, schema4, "f2"); + + assertDoesNotThrow(() -> new HoodieTableFactory().createDynamicTableSource(sourceContext5)); + assertDoesNotThrow(() -> new HoodieTableFactory().createDynamicTableSink(sourceContext5)); + // precombine field specified(default ts), use DefaultHoodieRecordPayload as payload clazz + HoodieTableSource tableSource5 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext5); + HoodieTableSink tableSink5 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sourceContext5); + assertThat(tableSource5.getConf().getString(FlinkOptions.PAYLOAD_CLASS_NAME), is(EventTimeAvroPayload.class.getName())); + assertThat(tableSink5.getConf().getString(FlinkOptions.PAYLOAD_CLASS_NAME), is(EventTimeAvroPayload.class.getName())); } @Test @@ -231,6 +264,32 @@ void testSetupCleaningOptionsForSource() { assertThat(conf2.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS), is(45)); } + @Test + void testSetupReadOptionsForSource() { + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + // set up new retains commits that is less than min archive commits + this.conf.setString(FlinkOptions.READ_END_COMMIT, "123"); + + final MockContext sourceContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource1 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext1); + final Configuration conf1 = tableSource1.getConf(); + assertThat(conf1.getString(FlinkOptions.QUERY_TYPE), is(FlinkOptions.QUERY_TYPE_INCREMENTAL)); + + this.conf.removeConfig(FlinkOptions.READ_END_COMMIT); + this.conf.setString(FlinkOptions.READ_START_COMMIT, "123"); + final MockContext sourceContext2 = 
MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource2 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext2); + final Configuration conf2 = tableSource2.getConf(); + assertThat(conf2.getString(FlinkOptions.QUERY_TYPE), is(FlinkOptions.QUERY_TYPE_INCREMENTAL)); + } + @Test void testInferAvroSchemaForSink() { // infer the schema if not specified @@ -368,6 +427,21 @@ void testSetupTimestampBasedKeyGenForSink() { is("UTC")); } + @Test + void testSetupWriteOptionsForSink() { + final HoodieTableSink tableSink1 = + (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(MockContext.getInstance(this.conf)); + final Configuration conf1 = tableSink1.getConf(); + assertThat(conf1.get(FlinkOptions.PRE_COMBINE), is(true)); + + // set up operation as 'insert' + this.conf.setString(FlinkOptions.OPERATION, "insert"); + HoodieTableSink tableSink2 = + (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(MockContext.getInstance(this.conf)); + Configuration conf2 = tableSink2.getConf(); + assertThat(conf2.get(FlinkOptions.PRE_COMBINE), is(false)); + } + // ------------------------------------------------------------------------- // Inner Class // ------------------------------------------------------------------------- diff --git a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableSource.java b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableSource.java index d50a716cf741c..8ee18a9601b2f 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableSource.java +++ b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableSource.java @@ -19,7 +19,6 @@ package org.apache.hudi.table; import org.apache.hudi.configuration.FlinkOptions; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; import org.apache.hudi.utils.TestConfigurations; import org.apache.hudi.utils.TestData; @@ -31,6 +30,7 @@ import org.apache.flink.table.data.RowData; import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.function.ThrowingSupplier; import org.junit.jupiter.api.io.TempDir; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,9 +46,9 @@ import static org.hamcrest.CoreMatchers.instanceOf; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.core.Is.is; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertThrows; /** * Test cases for HoodieTableSource. @@ -112,9 +112,9 @@ void testGetInputFormat() throws Exception { inputFormat = tableSource.getInputFormat(); assertThat(inputFormat, is(instanceOf(MergeOnReadInputFormat.class))); conf.setString(FlinkOptions.QUERY_TYPE.key(), FlinkOptions.QUERY_TYPE_INCREMENTAL); - assertThrows(HoodieException.class, - () -> tableSource.getInputFormat(), - "Invalid query type : 'incremental'. 
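The two new tests above (testSetupReadOptionsForSource and testSetupWriteOptionsForSink) pin down the option inference expected from HoodieTableFactory: supplying either read boundary switches the query type to incremental, and pre-combine is only enabled by default for upsert-style writes. A condensed sketch of those expectations, reusing this class's conf, schema1 and MockContext helpers (an assumption about scope):

```java
// Sketch of the inferred options the tests above assert.
this.conf.setString(FlinkOptions.READ_END_COMMIT, "123");
HoodieTableSource source = (HoodieTableSource) new HoodieTableFactory()
    .createDynamicTableSource(MockContext.getInstance(this.conf, schema1, "f2"));
// A read boundary was given, so the factory rewrites the query type to incremental.
assertThat(source.getConf().getString(FlinkOptions.QUERY_TYPE), is(FlinkOptions.QUERY_TYPE_INCREMENTAL));
this.conf.removeConfig(FlinkOptions.READ_END_COMMIT);

this.conf.setString(FlinkOptions.OPERATION, "insert");
HoodieTableSink sink = (HoodieTableSink) new HoodieTableFactory()
    .createDynamicTableSink(MockContext.getInstance(this.conf));
// Plain inserts do not need de-duplication, so pre-combine stays off.
assertThat(sink.getConf().get(FlinkOptions.PRE_COMBINE), is(false));
```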
Only 'snapshot' is supported now"); + assertDoesNotThrow( + (ThrowingSupplier>) tableSource::getInputFormat, + "Query type: 'incremental' should be supported"); } @Test diff --git a/hudi-flink/src/test/java/org/apache/hudi/table/format/TestInputFormat.java b/hudi-flink/src/test/java/org/apache/hudi/table/format/TestInputFormat.java index f83b2d991c1ea..f4da947f3bfc2 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/table/format/TestInputFormat.java +++ b/hudi-flink/src/test/java/org/apache/hudi/table/format/TestInputFormat.java @@ -19,10 +19,13 @@ package org.apache.hudi.table.format; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.table.HoodieTableSource; import org.apache.hudi.table.format.cow.CopyOnWriteInputFormat; import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; +import org.apache.hudi.util.AvroSchemaConverter; import org.apache.hudi.util.StreamerUtil; import org.apache.hudi.utils.TestConfigurations; import org.apache.hudi.utils.TestData; @@ -44,6 +47,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static org.hamcrest.CoreMatchers.instanceOf; import static org.hamcrest.CoreMatchers.is; @@ -71,12 +75,7 @@ void beforeEach(HoodieTableType tableType, Map options) throws I options.forEach((key, value) -> conf.setString(key, value)); StreamerUtil.initTableIfNotExists(conf); - this.tableSource = new HoodieTableSource( - TestConfigurations.TABLE_SCHEMA, - new Path(tempFile.getAbsolutePath()), - Collections.singletonList("partition"), - "default", - conf); + this.tableSource = getTableSource(conf); } @ParameterizedTest @@ -385,10 +384,95 @@ void testReadChangesUnMergedMOR() throws Exception { assertThat(actual, is(expected)); } + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testReadIncrementally(HoodieTableType tableType) throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.QUERY_TYPE.key(), FlinkOptions.QUERY_TYPE_INCREMENTAL); + beforeEach(tableType, options); + + // write another commit to read again + for (int i = 0; i < 6; i += 2) { + List dataset = TestData.dataSetInsert(i + 1, i + 2); + TestData.writeData(dataset, conf); + } + + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(tempFile.getAbsolutePath()); + List commits = metaClient.getCommitsTimeline().filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + + assertThat(commits.size(), is(3)); + + // only the start commit + conf.setString(FlinkOptions.READ_START_COMMIT, commits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat1 = this.tableSource.getInputFormat(); + assertThat(inputFormat1, instanceOf(MergeOnReadInputFormat.class)); + + List actual1 = readData(inputFormat1); + final List expected1 = TestData.dataSetInsert(3, 4, 5, 6); + TestData.assertRowDataEquals(actual1, expected1); + + // only the start commit: earliest + conf.setString(FlinkOptions.READ_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST); + this.tableSource = getTableSource(conf); + InputFormat inputFormat2 = this.tableSource.getInputFormat(); + assertThat(inputFormat2, instanceOf(MergeOnReadInputFormat.class)); + + List actual2 = readData(inputFormat2); + final List expected2 = TestData.dataSetInsert(1, 2, 3, 4, 5, 6); + 
TestData.assertRowDataEquals(actual2, expected2); + + // start and end commit: [start commit, end commit] + conf.setString(FlinkOptions.READ_START_COMMIT, commits.get(0)); + conf.setString(FlinkOptions.READ_END_COMMIT, commits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat3 = this.tableSource.getInputFormat(); + assertThat(inputFormat3, instanceOf(MergeOnReadInputFormat.class)); + + List actual3 = readData(inputFormat3); + final List expected3 = TestData.dataSetInsert(1, 2, 3, 4); + TestData.assertRowDataEquals(actual3, expected3); + + // only the end commit: point in time query + conf.removeConfig(FlinkOptions.READ_START_COMMIT); + conf.setString(FlinkOptions.READ_END_COMMIT, commits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat4 = this.tableSource.getInputFormat(); + assertThat(inputFormat4, instanceOf(MergeOnReadInputFormat.class)); + + List actual4 = readData(inputFormat4); + final List expected4 = TestData.dataSetInsert(3, 4); + TestData.assertRowDataEquals(actual4, expected4); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testReadWithWiderSchema(HoodieTableType tableType) throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.SOURCE_AVRO_SCHEMA.key(), + AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE_WIDER).toString()); + beforeEach(tableType, options); + + TestData.writeData(TestData.DATA_SET_INSERT, conf); + InputFormat inputFormat = this.tableSource.getInputFormat(); + List result = readData(inputFormat); + TestData.assertRowDataEquals(result, TestData.DATA_SET_INSERT); + } + // ------------------------------------------------------------------------- // Utilities // ------------------------------------------------------------------------- + private HoodieTableSource getTableSource(Configuration conf) { + return new HoodieTableSource( + TestConfigurations.TABLE_SCHEMA, + new Path(tempFile.getAbsolutePath()), + Collections.singletonList("partition"), + "default", + conf); + } + @SuppressWarnings("unchecked, rawtypes") private static List readData(InputFormat inputFormat) throws IOException { InputSplit[] inputSplits = inputFormat.createInputSplits(1); diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/TestCompactionUtil.java b/hudi-flink/src/test/java/org/apache/hudi/utils/TestCompactionUtil.java index 9bd03e115eb81..a5fed83ea15de 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/TestCompactionUtil.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/TestCompactionUtil.java @@ -20,7 +20,6 @@ import org.apache.hudi.avro.model.HoodieCompactionOperation; import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -30,38 +29,90 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.HoodieFlinkTable; import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkTables; import org.apache.hudi.util.StreamerUtil; import org.apache.flink.configuration.Configuration; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import java.io.File; import java.io.IOException; import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; 
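testReadIncrementally above covers the supported boundary combinations: a start commit alone (or 'earliest'), a closed [start, end] range, and an end commit alone for a point-in-time style read. For reference, the closed-range case expressed through table options rather than the raw Configuration is sketched below; the option keys come from FlinkOptions as used elsewhere in this patch, but the DDL itself is illustrative, not taken from a test:

```java
// Illustrative sketch: the [start, end] range of testReadIncrementally as table options.
String ddl = sql("t1")
    .option(FlinkOptions.PATH, tempFile.getAbsolutePath())
    .option(FlinkOptions.QUERY_TYPE, FlinkOptions.QUERY_TYPE_INCREMENTAL)
    .option(FlinkOptions.READ_START_COMMIT, commits.get(0)) // inclusive lower bound
    .option(FlinkOptions.READ_END_COMMIT, commits.get(1))   // inclusive upper bound
    .end();
// A batch query over such a table is expected to return only the rows written by the two
// bounded commits, i.e. TestData.dataSetInsert(1, 2, 3, 4) in the test above.
```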
import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Test cases for {@link org.apache.hudi.util.CompactionUtil}. */ public class TestCompactionUtil { + private HoodieFlinkTable table; + private HoodieTableMetaClient metaClient; + private Configuration conf; + @TempDir File tempFile; + @BeforeEach + void beforeEach() throws IOException { + this.conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + StreamerUtil.initTableIfNotExists(conf); + this.table = FlinkTables.createTable(conf); + this.metaClient = table.getMetaClient(); + } + @Test - void rollbackCompaction() throws IOException { - Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); - conf.setInteger(FlinkOptions.COMPACTION_DELTA_SECONDS, 0); + void rollbackCompaction() { + List oriInstants = IntStream.range(0, 3) + .mapToObj(i -> generateCompactionPlan()).collect(Collectors.toList()); + List instants = metaClient.getActiveTimeline() + .filterPendingCompactionTimeline() + .filter(instant -> instant.getState() == HoodieInstant.State.INFLIGHT) + .getInstants() + .collect(Collectors.toList()); + assertThat("all the instants should be in pending state", instants.size(), is(3)); + CompactionUtil.rollbackCompaction(table); + boolean allRolledBack = metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants() + .allMatch(instant -> instant.getState() == HoodieInstant.State.REQUESTED); + assertTrue(allRolledBack, "all the instants should be rolled back"); + List actualInstants = metaClient.getActiveTimeline() + .filterPendingCompactionTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + assertThat(actualInstants, is(oriInstants)); + } - StreamerUtil.initTableIfNotExists(conf); + @Test + void rollbackEarliestCompaction() { + conf.setInteger(FlinkOptions.COMPACTION_TIMEOUT_SECONDS, 0); + List oriInstants = IntStream.range(0, 3) + .mapToObj(i -> generateCompactionPlan()).collect(Collectors.toList()); + List instants = metaClient.getActiveTimeline() + .filterPendingCompactionTimeline() + .filter(instant -> instant.getState() == HoodieInstant.State.INFLIGHT) + .getInstants() + .collect(Collectors.toList()); + assertThat("all the instants should be in pending state", instants.size(), is(3)); + CompactionUtil.rollbackEarliestCompaction(table, conf); + long requestedCnt = metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants() + .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED).count(); + assertThat("Only the first instant expects to be rolled back", requestedCnt, is(1L)); - HoodieFlinkWriteClient writeClient = StreamerUtil.createWriteClient(conf, null); - HoodieFlinkTable table = writeClient.getHoodieTable(); - HoodieTableMetaClient metaClient = table.getMetaClient(); + String instantTime = metaClient.getActiveTimeline() + .filterPendingCompactionTimeline().filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED) + .firstInstant().get().getTimestamp(); + assertThat(instantTime, is(oriInstants.get(0))); + } + /** + * Generates a compaction plan on the timeline and returns its instant time. 
+ */ + private String generateCompactionPlan() { HoodieCompactionOperation operation = new HoodieCompactionOperation(); HoodieCompactionPlan plan = new HoodieCompactionPlan(Collections.singletonList(operation), Collections.emptyMap(), 1); String instantTime = HoodieActiveTimeline.createNewInstantTime(); @@ -75,13 +126,7 @@ void rollbackCompaction() throws IOException { throw new HoodieIOException("Exception scheduling compaction", ioe); } metaClient.reloadActiveTimeline(); - HoodieInstant instant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().orElse(null); - assertThat(instant.getTimestamp(), is(instantTime)); - - CompactionUtil.rollbackCompaction(table, writeClient, conf); - HoodieInstant rollbackInstant = table.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().get(); - assertThat(rollbackInstant.getState(), is(HoodieInstant.State.REQUESTED)); - assertThat(rollbackInstant.getTimestamp(), is(instantTime)); + return instantTime; } } diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java b/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java index b66d55a77c7fa..0eafb1281ff4b 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java @@ -32,9 +32,12 @@ import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.RowType; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.stream.Collectors; /** * Configurations for the test. @@ -57,6 +60,20 @@ private TestConfigurations() { .fields(ROW_TYPE.getFieldNames(), ROW_DATA_TYPE.getChildren()) .build(); + private static final List FIELDS = ROW_TYPE.getFields().stream() + .map(RowType.RowField::asSummaryString).collect(Collectors.toList()); + + public static final DataType ROW_DATA_TYPE_WIDER = DataTypes.ROW( + DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key + DataTypes.FIELD("name", DataTypes.VARCHAR(10)), + DataTypes.FIELD("age", DataTypes.INT()), + DataTypes.FIELD("salary", DataTypes.DOUBLE()), + DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field + DataTypes.FIELD("partition", DataTypes.VARCHAR(10))) + .notNull(); + + public static final RowType ROW_TYPE_WIDER = (RowType) ROW_DATA_TYPE_WIDER.getLogicalType(); + public static String getCreateHoodieTableDDL(String tableName, Map options) { return getCreateHoodieTableDDL(tableName, options, true, "partition"); } @@ -66,20 +83,29 @@ public static String getCreateHoodieTableDDL( Map options, boolean havePartition, String partitionField) { + return getCreateHoodieTableDDL(tableName, FIELDS, options, havePartition, "uuid", partitionField); + } + + public static String getCreateHoodieTableDDL( + String tableName, + List fields, + Map options, + boolean havePartition, + String pkField, + String partitionField) { StringBuilder builder = new StringBuilder(); - builder.append("create table " + tableName + "(\n" - + " uuid varchar(20),\n" - + " name varchar(10),\n" - + " age int,\n" - + " ts timestamp(3),\n" - + " `partition` varchar(20),\n" - + " PRIMARY KEY(uuid) NOT ENFORCED\n" - + ")\n"); + builder.append("create table ").append(tableName).append("(\n"); + for (String field : fields) { + builder.append(" ").append(field).append(",\n"); + } + builder.append(" PRIMARY KEY(").append(pkField).append(") NOT ENFORCED\n") + .append(")\n"); if (havePartition) { 
builder.append("PARTITIONED BY (`").append(partitionField).append("`)\n"); } + final String connector = options.computeIfAbsent("connector", k -> "hudi"); builder.append("with (\n" - + " 'connector' = 'hudi'"); + + " 'connector' = '").append(connector).append("'"); options.forEach((k, v) -> builder.append(",\n") .append(" '").append(k).append("' = '").append(v).append("'")); builder.append("\n)"); @@ -205,8 +231,10 @@ public static Sql sql(String tableName) { */ public static class Sql { private final Map options; - private String tableName; + private final String tableName; + private List fields = new ArrayList<>(); private boolean withPartition = true; + private String pkField = "uuid"; private String partitionField = "partition"; public Sql(String tableName) { @@ -219,8 +247,18 @@ public Sql option(ConfigOption option, Object val) { return this; } - public Sql withPartition(boolean withPartition) { - this.withPartition = withPartition; + public Sql options(Map options) { + this.options.putAll(options); + return this; + } + + public Sql noPartition() { + this.withPartition = false; + return this; + } + + public Sql pkField(String pkField) { + this.pkField = pkField; return this; } @@ -229,8 +267,17 @@ public Sql partitionField(String partitionField) { return this; } + public Sql field(String fieldSchema) { + fields.add(fieldSchema); + return this; + } + public String end() { - return TestConfigurations.getCreateHoodieTableDDL(this.tableName, options, this.withPartition, this.partitionField); + if (this.fields.size() == 0) { + this.fields = FIELDS; + } + return TestConfigurations.getCreateHoodieTableDDL(this.tableName, this.fields, options, + this.withPartition, this.pkField, this.partitionField); } } } diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java b/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java index 3e0afc25a0dbc..e8e177b823626 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java @@ -18,7 +18,6 @@ package org.apache.hudi.utils; -import org.apache.hudi.client.FlinkTaskContextSupplier; import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; @@ -284,6 +283,14 @@ public class TestData { TimestampData.fromEpochMillis(2), StringData.fromString("par1")) ); + public static List dataSetInsert(int... ids) { + List inserts = new ArrayList<>(); + Arrays.stream(ids).forEach(i -> inserts.add( + insertRow(StringData.fromString("id" + i), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(i), StringData.fromString("par1")))); + return inserts; + } + private static Integer toIdSafely(Object id) { if (id == null) { return -1; @@ -424,7 +431,7 @@ public static void assertRowDataEquals(List rows, String expected) { */ public static void assertRowDataEquals(List rows, List expected) { String rowsString = rowDataToString(rows); - assertThat(rowDataToString(expected), is(rowsString)); + assertThat(rowsString, is(rowDataToString(expected))); } /** @@ -535,17 +542,15 @@ public static void checkWrittenFullData( // 1. 
init flink table HoodieTableMetaClient metaClient = HoodieTestUtils.init(basePath.getAbsolutePath()); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath.getAbsolutePath()).build(); - FlinkTaskContextSupplier supplier = new FlinkTaskContextSupplier(null); - HoodieFlinkEngineContext context = new HoodieFlinkEngineContext(supplier); - HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient); + HoodieFlinkTable table = HoodieFlinkTable.create(config, HoodieFlinkEngineContext.DEFAULT, metaClient); // 2. check each partition data expected.forEach((partition, partitionDataSet) -> { List readBuffer = new ArrayList<>(); - table.getFileSystemView().getAllFileGroups(partition) - .forEach(v -> v.getLatestDataFile().ifPresent(baseFile -> { + table.getBaseFileOnlyView().getLatestBaseFiles(partition) + .forEach(baseFile -> { String path = baseFile.getPath(); try { ParquetReader reader = AvroParquetReader.builder(new Path(path)).build(); @@ -557,7 +562,7 @@ public static void checkWrittenFullData( } catch (IOException e) { throw new RuntimeException(e); } - })); + }); assertTrue(partitionDataSet.size() == readBuffer.size() && partitionDataSet.containsAll(readBuffer)); diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java b/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java index 8822a6f79b7ad..9dc78aa4cf273 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java @@ -35,6 +35,13 @@ private TestSQL() { + "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n" + "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')"; + public static final String INSERT_SAME_KEY_T1 = "insert into t1 values\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:05','par1'),\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:04','par1'),\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:03','par1'),\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:02','par1'),\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')"; + public static final String UPDATE_INSERT_T1 = "insert into t1 values\n" + "('id1','Danny',24,TIMESTAMP '1970-01-01 00:00:01','par1'),\n" + "('id2','Stephen',34,TIMESTAMP '1970-01-01 00:00:02','par1'),\n" diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/TestUtils.java b/hudi-flink/src/test/java/org/apache/hudi/utils/TestUtils.java index 4e9ad5123e820..92e16cd1059bf 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/TestUtils.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/TestUtils.java @@ -19,6 +19,8 @@ package org.apache.hudi.utils; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.source.StreamReadMonitoringFunction; import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; @@ -27,37 +29,50 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.core.fs.Path; -import java.io.File; -import java.util.Collections; - import static org.junit.jupiter.api.Assertions.assertTrue; /** * Common test utils. 
*/ public class TestUtils { + public static String getLastPendingInstant(String basePath) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(StreamerUtil.getHadoopConf()).setBasePath(basePath).build(); + return StreamerUtil.getLastPendingInstant(metaClient); + } + + public static String getLastCompleteInstant(String basePath) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(StreamerUtil.getHadoopConf()).setBasePath(basePath).build(); + return StreamerUtil.getLastCompletedInstant(metaClient); + } - public static String getLatestCommit(String basePath) { + public static String getLastDeltaCompleteInstant(String basePath) { final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() .setConf(StreamerUtil.getHadoopConf()).setBasePath(basePath).build(); - return metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants().lastInstant().get().getTimestamp(); + return metaClient.getCommitsTimeline().filterCompletedInstants() + .filter(hoodieInstant -> hoodieInstant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)) + .lastInstant() + .map(HoodieInstant::getTimestamp) + .orElse(null); } - public static String getFirstCommit(String basePath) { + public static String getFirstCompleteInstant(String basePath) { final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() .setConf(StreamerUtil.getHadoopConf()).setBasePath(basePath).build(); - return metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants().firstInstant().get().getTimestamp(); + return metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants().firstInstant() + .map(HoodieInstant::getTimestamp).orElse(null); } public static String getSplitPartitionPath(MergeOnReadInputSplit split) { assertTrue(split.getLogPaths().isPresent()); final String logPath = split.getLogPaths().get().get(0); - String[] paths = logPath.split(File.separator); + String[] paths = logPath.split(Path.SEPARATOR); return paths[paths.length - 2]; } public static StreamReadMonitoringFunction getMonitorFunc(Configuration conf) { final String basePath = conf.getString(FlinkOptions.PATH); - return new StreamReadMonitoringFunction(conf, new Path(basePath), 1024 * 1024L, Collections.emptySet()); + return new StreamReadMonitoringFunction(conf, new Path(basePath), 1024 * 1024L, null); } } diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/TestViewStorageProperties.java b/hudi-flink/src/test/java/org/apache/hudi/utils/TestViewStorageProperties.java new file mode 100644 index 0000000000000..f80760bf1fd85 --- /dev/null +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/TestViewStorageProperties.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utils; + +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.util.ViewStorageProperties; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * Test cases for {@link ViewStorageProperties}. + */ +public class TestViewStorageProperties { + @TempDir + File tempFile; + + @Test + void testReadWriteProperties() throws IOException { + String basePath = tempFile.getAbsolutePath(); + FileSystemViewStorageConfig config = FileSystemViewStorageConfig.newBuilder() + .withStorageType(FileSystemViewStorageType.SPILLABLE_DISK) + .withRemoteServerHost("host1") + .withRemoteServerPort(1234).build(); + ViewStorageProperties.createProperties(basePath, config); + ViewStorageProperties.createProperties(basePath, config); + ViewStorageProperties.createProperties(basePath, config); + + FileSystemViewStorageConfig readConfig = ViewStorageProperties.loadFromProperties(basePath); + assertThat(readConfig.getStorageType(), is(FileSystemViewStorageType.SPILLABLE_DISK)); + assertThat(readConfig.getRemoteViewServerHost(), is("host1")); + assertThat(readConfig.getRemoteViewServerPort(), is(1234)); + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java index b7141a8ee762f..90d53a6cd9cde 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java @@ -39,7 +39,7 @@ import java.util.stream.IntStream; /** - * Utility funcitons copied from Hive ColumnProjectionUtils.java. + * Utility functions copied from Hive ColumnProjectionUtils.java. * Needed to copy as we see NoSuchMethod errors when directly using these APIs with/without Spark. * Some of these methods are not available across hive versions. */ diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java index d94018b88546f..c797f59efc035 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java @@ -78,6 +78,11 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial */ Map metaClientCache; + /** + * HoodieTableFileSystemView Cache. + */ + private Map hoodieTableFileSystemViewCache; + /** * Hadoop configurations for the FileSystem. 
*/ @@ -97,6 +102,7 @@ public HoodieROTablePathFilter(Configuration conf) { this.nonHoodiePathCache = new HashSet<>(); this.conf = new SerializableConfiguration(conf); this.metaClientCache = new HashMap<>(); + this.hoodieTableFileSystemViewCache = new HashMap<>(); } /** @@ -175,8 +181,15 @@ public boolean accept(Path path) { metaClientCache.put(baseDir.toString(), metaClient); } - fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext, - metaClient, HoodieInputFormatUtils.buildMetadataConfig(getConf())); + HoodieTableMetaClient finalMetaClient = metaClient; + fsView = hoodieTableFileSystemViewCache.computeIfAbsent(baseDir.toString(), key -> + FileSystemViewManager.createInMemoryFileSystemView( + engineContext, + finalMetaClient, + HoodieInputFormatUtils.buildMetadataConfig(getConf()) + ) + ); + String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder); List latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList()); // populate the cache @@ -202,10 +215,6 @@ public boolean accept(Path path) { } nonHoodiePathCache.add(folder.toString()); return true; - } finally { - if (fsView != null) { - fsView.close(); - } } } else { // files is at < 3 level depth in FS tree, can't be hoodie dataset diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java index 914e698fa5adb..7fecd57927050 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java @@ -20,7 +20,6 @@ import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit; -import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; import org.apache.hadoop.hive.ql.io.IOContextMap; import org.apache.hadoop.io.ArrayWritable; @@ -66,8 +65,6 @@ public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split @Override public boolean next(NullWritable key, ArrayWritable value) throws IOException { if (this.currentRecordReader.next(key, value)) { - LOG.info("Reading from record reader"); - LOG.info(HoodieRealtimeRecordReaderUtils.arrayWritableToString(value)); return true; } else if (recordReaders.size() > 0) { this.currentRecordReader.close(); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java index 63728e38f1c7a..6f92359b2b613 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java @@ -91,7 +91,6 @@ public RecordReader getRecordReader(final InputSpli // For e:g _hoodie_record_key would be missing and merge step would throw exceptions. // TO fix this, hoodie columns are appended late at the time record-reader gets built instead of construction // time. 
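The HoodieROTablePathFilter change above replaces the per-call in-memory file system view (previously closed in a finally block) with one view cached per table base path, which is why the finally block is dropped. Reduced to its core, the caching looks roughly like the sketch below; the field and method names are illustrative, only the computeIfAbsent pattern and the FileSystemViewManager call mirror the actual change:

```java
// Illustrative reduction of the cache added above (not the literal class layout).
private final Map<String, HoodieTableFileSystemView> viewCache = new HashMap<>();

private HoodieTableFileSystemView viewFor(String baseDir, HoodieTableMetaClient metaClient) {
  // Build the view once per base path and reuse it across accept() calls, instead of
  // constructing and closing a fresh view for every path that is filtered.
  return viewCache.computeIfAbsent(baseDir, key ->
      FileSystemViewManager.createInMemoryFileSystemView(
          engineContext, metaClient, HoodieInputFormatUtils.buildMetadataConfig(getConf())));
}
```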
- HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf, Option.empty()); this.conf = jobConf; @@ -99,6 +98,7 @@ public RecordReader getRecordReader(final InputSpli } } } + HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index 028641c62b96b..af68dff6aaec4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -98,7 +98,6 @@ void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf job // For e:g _hoodie_record_key would be missing and merge step would throw exceptions. // TO fix this, hoodie columns are appended late at the time record-reader gets built instead of construction // time. - HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); if (!realtimeSplit.getDeltaLogPaths().isEmpty()) { HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf, realtimeSplit.getHoodieVirtualKeyInfo()); } @@ -107,6 +106,7 @@ void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf job } } } + HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); } @Override diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java index 70c43011b0fba..c5b97f99f83af 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java @@ -25,7 +25,6 @@ import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodiePartitionMetadata; -import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -72,7 +71,6 @@ import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE; -import static org.apache.hudi.common.config.HoodieMetadataConfig.VALIDATE_ENABLE; public class HoodieInputFormatUtils { @@ -419,7 +417,6 @@ public static Map> groupSnapshotPathsByMetaCli public static HoodieMetadataConfig buildMetadataConfig(Configuration conf) { return HoodieMetadataConfig.newBuilder() .enable(conf.getBoolean(ENABLE.key(), DEFAULT_METADATA_ENABLE_FOR_READERS)) - .validate(conf.getBoolean(VALIDATE_ENABLE.key(), VALIDATE_ENABLE.defaultValue())) .build(); } @@ -489,43 +486,50 @@ private static HoodieBaseFile refreshFileStatus(Configuration conf, HoodieBaseFi } /** - * Iterate through a list of commits in ascending order, and extract the file status of - * all affected files from the commits metadata grouping by partition path. 
If the files has + * Iterate through a list of commit metadata in natural order, and extract the file status of + * all affected files from the commits metadata grouping by file full path. If the files has * been touched multiple times in the given commits, the return value will keep the one * from the latest commit. - * @param basePath - * @param commitsToCheck - * @param timeline - * @return HashMap> - * @throws IOException + * + * @param basePath The table base path + * @param metadataList The metadata list to read the data from + * + * @return the affected file status array */ - public static HashMap> listAffectedFilesForCommits( - Path basePath, List commitsToCheck, HoodieTimeline timeline) throws IOException { + public static FileStatus[] listAffectedFilesForCommits(Path basePath, List metadataList) { // TODO: Use HoodieMetaTable to extract affected file directly. - HashMap> partitionToFileStatusesMap = new HashMap<>(); - List sortedCommitsToCheck = new ArrayList<>(commitsToCheck); - sortedCommitsToCheck.sort(HoodieInstant::compareTo); + HashMap fullPathToFileStatus = new HashMap<>(); // Iterate through the given commits. - for (HoodieInstant commit: sortedCommitsToCheck) { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(), - HoodieCommitMetadata.class); - // Iterate through all the affected partitions of a commit. - for (Map.Entry> entry: commitMetadata.getPartitionToWriteStats().entrySet()) { - if (!partitionToFileStatusesMap.containsKey(entry.getKey())) { - partitionToFileStatusesMap.put(entry.getKey(), new HashMap<>()); - } - // Iterate through all the written files of this partition. - for (HoodieWriteStat stat : entry.getValue()) { - String relativeFilePath = stat.getPath(); - Path fullPath = relativeFilePath != null ? FSUtils.getPartitionPath(basePath, relativeFilePath) : null; - if (fullPath != null) { - FileStatus fs = new FileStatus(stat.getFileSizeInBytes(), false, 0, 0, - 0, fullPath); - partitionToFileStatusesMap.get(entry.getKey()).put(fullPath.getName(), fs); - } - } - } + for (HoodieCommitMetadata metadata: metadataList) { + fullPathToFileStatus.putAll(metadata.getFullPathToFileStatus(basePath.toString())); } - return partitionToFileStatusesMap; + return fullPathToFileStatus.values().toArray(new FileStatus[0]); + } + + /** + * Returns all the incremental write partition paths as a set with the given commits metadata. + * + * @param metadataList The commits metadata + * @return the partition path set + */ + public static Set getWritePartitionPaths(List metadataList) { + return metadataList.stream() + .map(HoodieCommitMetadata::getWritePartitionPaths) + .flatMap(Collection::stream) + .collect(Collectors.toSet()); + } + + /** + * Returns the commit metadata of the given instant. 
+ * + * @param instant The hoodie instant + * @param timeline The timeline + * @return the commit metadata + */ + public static HoodieCommitMetadata getCommitMetadata( + HoodieInstant instant, + HoodieTimeline timeline) throws IOException { + byte[] data = timeline.getInstantDetails(instant).get(); + return HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class); } } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java index a647da9b9b99f..1771db056cfa2 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.testutils.FileCreateUtils; import org.apache.hudi.common.testutils.HoodieTestUtils; @@ -137,6 +138,24 @@ protected Properties getPropertiesForKeyGen() { public void testReader(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled, boolean partitioned) throws Exception { + testReaderInternal(diskMapType, isCompressionEnabled, partitioned); + } + + @Test + public void testHFileInlineReader() throws Exception { + testReaderInternal(ExternalSpillableMap.DiskMapType.BITCASK, false, false, + HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK); + } + + private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean partitioned) throws Exception { + testReaderInternal(diskMapType, isCompressionEnabled, partitioned, HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK); + } + + private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean partitioned, HoodieLogBlock.HoodieLogBlockType logBlockType) throws Exception { // initial commit Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ); @@ -175,7 +194,7 @@ public void testReader(ExternalSpillableMap.DiskMapType diskMapType, } else { writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", baseInstant, - instantTime, 120, 0, logVersion); + instantTime, 120, 0, logVersion, logBlockType); } long size = writer.getCurrentSize(); writer.close(); diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java index d10ccfca91594..13d921979c70a 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java @@ -27,6 +27,8 @@ import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; import org.apache.hudi.common.table.log.block.HoodieCommandBlock; +import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock; import 
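The three helpers introduced above are meant to be chained when an incremental query is resolved: read each instant's commit metadata, derive the partitions it touched, and collect the affected file statuses (deduplicated by full path, latest commit winning). A usage sketch built only from the signatures shown in this patch; the timeline, instants and base path are assumed to be supplied by the caller, and the IOException from getCommitMetadata is left to propagate:

```java
// Sketch: wiring the new helpers together for an incremental read.
List<HoodieCommitMetadata> metadataList = new ArrayList<>();
for (HoodieInstant instant : instantsToCheck) {          // caller-provided instants
  metadataList.add(HoodieInputFormatUtils.getCommitMetadata(instant, timeline));
}

// Partitions written by those commits ...
Set<String> partitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList);

// ... and every file they touched, keeping the status from the latest commit per path.
FileStatus[] affectedFiles = HoodieInputFormatUtils.listAffectedFilesForCommits(basePath, metadataList);
```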
org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.SchemaTestUtil; @@ -301,7 +303,14 @@ public static HoodieLogFormat.Writer writeRollback(File partitionDir, FileSystem public static HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, FileSystem fs, Schema schema, String fileId, - String baseCommit, String newCommit, int numberOfRecords, int offset, int logVersion) + String baseCommit, String newCommit, int numberOfRecords, int offset, int logVersion) throws IOException, InterruptedException { + return writeDataBlockToLogFile(partitionDir, fs, schema, fileId, baseCommit, newCommit, numberOfRecords, offset, logVersion, HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK); + } + + public static HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, FileSystem fs, Schema schema, String + fileId, + String baseCommit, String newCommit, int numberOfRecords, int offset, int logVersion, + HoodieLogBlock.HoodieLogBlockType logBlockType) throws InterruptedException, IOException { HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionDir.getPath())) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId).withLogVersion(logVersion) @@ -314,7 +323,8 @@ public static HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, newCommit); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchema.toString()); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); + HoodieDataBlock dataBlock = (logBlockType == HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK) ? new HoodieHFileDataBlock(records, header) : + new HoodieAvroDataBlock(records, header); writer.appendBlock(dataBlock); return writer; } diff --git a/hudi-integ-test/README.md b/hudi-integ-test/README.md index 4a9e9bc674b5f..ffdedf849298e 100644 --- a/hudi-integ-test/README.md +++ b/hudi-integ-test/README.md @@ -177,7 +177,7 @@ cd /opt Copy the integration tests jar into the docker container ``` -docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt +docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt ``` ``` @@ -214,21 +214,29 @@ spark-submit \ --conf spark.network.timeout=600s \ --conf spark.yarn.max.executor.failures=10 \ --conf spark.sql.catalogImplementation=hive \ +--conf spark.driver.extraClassPath=/var/demo/jars/* \ +--conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ --input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \ --target-table table1 \ --props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \ ---schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ --workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \ --workload-generator-classname 
org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --table-type COPY_ON_WRITE \ ---compact-scheduling-minshare 1 +--compact-scheduling-minshare 1 \ +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +--clean-input \ +--clean-output ``` Or a Merge-on-Read job: @@ -253,23 +261,44 @@ spark-submit \ --conf spark.network.timeout=600s \ --conf spark.yarn.max.executor.failures=10 \ --conf spark.sql.catalogImplementation=hive \ +--conf spark.driver.extraClassPath=/var/demo/jars/* \ +--conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ --input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \ --target-table table1 \ --props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \ ---schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ --workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --table-type MERGE_ON_READ \ ---compact-scheduling-minshare 1 +--compact-scheduling-minshare 1 \ +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +--clean-input \ +--clean-output ``` +## Visualize and inspect the hoodie metrics and performance (local) +The Graphite server is already set up (and running) by ```docker/setup_demo.sh```. + +Open a browser and access the metrics at +``` +http://localhost:80 +``` +Dashboard: +``` +http://localhost/dashboard + +``` + ## Running long running test suite in Local Docker environment For long running test suite, validation has to be done differently. Idea is to run same dag in a repeated manner for @@ -279,12 +308,12 @@ contents both via spark datasource and hive table via spark sql engine. Hive val If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not go well w/ hive2* jars. So, after running docker setup, follow the below steps. ``` -docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar adhoc-2:/opt/ -docker cp demo/config/test-suite/test.properties adhoc-2:/opt/ +docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/ +docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/ ``` Also copy your dag of interest to adhoc-2:/opt/ ``` -docker cp demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/ +docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/ ``` For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins". 
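For reference, a sketch of how a repeated run might be configured, assuming these two options are accepted as top-level entries of the workload yaml; only the two option names come from this README, the values are illustrative:
```
# hypothetical excerpt from the top of a long-running dag yaml
dag_rounds: 10
dag_intermittent_delay_mins: 5
```
The ready-to-use dags under docker/demo/config/test-suite/ show the exact layout these options expect.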
@@ -428,7 +457,7 @@ spark-submit \ --conf spark.driver.extraClassPath=/var/demo/jars/* \ --conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.8.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ @@ -446,6 +475,14 @@ spark-submit \ --clean-output ``` +If you wish to enable metrics, add the properties below as well: +``` +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +``` + Few ready to use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running dags. ``` diff --git a/hudi-integ-test/pom.xml b/hudi-integ-test/pom.xml index e0026e438d0d4..c697d6cc0102a 100644 --- a/hudi-integ-test/pom.xml +++ b/hudi-integ-test/pom.xml @@ -304,6 +304,7 @@ org.apache.hadoop hadoop-hdfs tests + test javax.servlet diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java index 0bcbaf8f8d9dd..d8ed649d9b08b 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java @@ -101,7 +101,7 @@ public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc) throw this.cfg = cfg; this.jsc = jsc; cfg.propsFilePath = FSUtils.addSchemeIfLocalPath(cfg.propsFilePath).toString(); - this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate(); + this.sparkSession = SparkSession.builder().config(jsc.getConf()).enableHiveSupport().getOrCreate(); this.fs = FSUtils.getFs(cfg.inputBasePath, jsc.hadoopConfiguration()); this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); log.info("Creating workload generator with configs : {}", props.toString()); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java index 9ac917acb08b5..41ef3f4ab968c 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java @@ -18,6 +18,7 @@ package org.apache.hudi.integ.testsuite; +import java.io.IOException; import java.io.Serializable; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; @@ -236,7 +237,7 @@ public void inlineClustering() { public Option scheduleCompaction(Option> previousCommitExtraMetadata) throws Exception { - if (!cfg.useDeltaStreamer) { + if (cfg.useDeltaStreamer) { deltaStreamerWrapper.scheduleCompact(); return Option.empty(); } else { @@ -251,7 +252,7 @@ public void commit(JavaRDD records, JavaRDD genera /** Store the checkpoint in the commit metadata just like * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)} **/ extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get()); - if (generatedDataStats != null) { + if (generatedDataStats != null && generatedDataStats.count() > 1) { // Just stores the path where this batch of data is 
generated to extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0)); } @@ -259,6 +260,21 @@ public void commit(JavaRDD records, JavaRDD genera } } + public void commitCompaction(JavaRDD records, JavaRDD generatedDataStats, + Option instantTime) throws IOException { + if (!cfg.useDeltaStreamer) { + Map extraMetadata = new HashMap<>(); + /** Store the checkpoint in the commit metadata just like + * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)} **/ + extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get()); + if (generatedDataStats != null && generatedDataStats.count() > 1) { + // Just stores the path where this batch of data is generated to + extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0)); + } + writeClient.commitCompaction(instantTime.get(), records, Option.of(extraMetadata)); + } + } + public SparkRDDWriteClient getWriteClient(DagNode dagNode) throws IllegalAccessException { if (cfg.useDeltaStreamer & !allowWriteClientAccess(dagNode)) { throw new IllegalAccessException("cannot access write client when testing in deltastreamer mode"); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java index d8ea946fe553a..b0ae06b6039d4 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java @@ -18,14 +18,14 @@ package org.apache.hudi.integ.testsuite.configuration; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.integ.testsuite.reader.DeltaInputType; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.hadoop.conf.Configuration; - import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; @@ -42,7 +42,7 @@ public class DeltaConfig implements Serializable { private final SerializableConfiguration configuration; public DeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType, - SerializableConfiguration configuration) { + SerializableConfiguration configuration) { this.deltaOutputMode = deltaOutputMode; this.deltaInputType = deltaInputType; this.configuration = configuration; @@ -96,6 +96,33 @@ public static class Config { private static String NUM_ROLLBACKS = "num_rollbacks"; private static String ENABLE_ROW_WRITING = "enable_row_writing"; + // Spark SQL Create Table + private static String TABLE_TYPE = "table_type"; + private static String IS_EXTERNAL = "is_external"; + private static String USE_CTAS = "use_ctas"; + private static String PRIMARY_KEY = "primary_key"; + private static String PRE_COMBINE_FIELD = "pre_combine_field"; + private static String PARTITION_FIELD = "partition_field"; + // Spark SQL Merge + private static String MERGE_CONDITION = "merge_condition"; + private static String DEFAULT_MERGE_CONDITION = "target._row_key = source._row_key"; + private static String MERGE_MATCHED_ACTION = "matched_action"; + private static String DEFAULT_MERGE_MATCHED_ACTION = "update set *"; + private static String 
MERGE_NOT_MATCHED_ACTION = "not_matched_action"; + private static String DEFAULT_MERGE_NOT_MATCHED_ACTION = "insert *"; + // Spark SQL Update + // Column to update. The update logic is hard-coded for now, i.e., "fare = fare * 1.6", and should be made configurable later. + private static String UPDATE_COLUMN = "update_column"; + private static String DEFAULT_UPDATE_COLUMN = "fare"; + private static String WHERE_CONDITION_COLUMN = "condition_column"; + // the where condition expression is like "begin_lon between 0.1 and 0.2" + // the value range is determined by the ratio of records to update or delete + // only numeric type columns are supported for now + private static String DEFAULT_WHERE_CONDITION_COLUMN = "begin_lon"; + // the ratio range is between 0.01 and 1.0; the actual ratio achieved is approximate, not exact + private static String RATIO_RECORDS_CHANGE = "ratio_records_change"; + private static double DEFAULT_RATIO_RECORDS_CHANGE = 0.5; + private Map configsMap; public Config(Map configsMap) { @@ -194,6 +221,58 @@ public boolean enableRowWriting() { return Boolean.valueOf(configsMap.getOrDefault(ENABLE_ROW_WRITING, false).toString()); } + public Option getTableType() { + return !configsMap.containsKey(TABLE_TYPE) ? Option.empty() + : Option.of(configsMap.get(TABLE_TYPE).toString()); + } + + public boolean shouldUseCtas() { + return Boolean.valueOf(configsMap.getOrDefault(USE_CTAS, false).toString()); + } + + public boolean isTableExternal() { + return Boolean.valueOf(configsMap.getOrDefault(IS_EXTERNAL, false).toString()); + } + + public Option getPrimaryKey() { + return !configsMap.containsKey(PRIMARY_KEY) ? Option.empty() + : Option.of(configsMap.get(PRIMARY_KEY).toString()); + } + + public Option getPreCombineField() { + return !configsMap.containsKey(PRE_COMBINE_FIELD) ? Option.empty() + : Option.of(configsMap.get(PRE_COMBINE_FIELD).toString()); + } + + public Option getPartitionField() { + return !configsMap.containsKey(PARTITION_FIELD) ? Option.empty() + : Option.of(configsMap.get(PARTITION_FIELD).toString()); + } + + public String getMergeCondition() { + return configsMap.getOrDefault(MERGE_CONDITION, DEFAULT_MERGE_CONDITION).toString(); + } + + public String getMatchedAction() { + return configsMap.getOrDefault(MERGE_MATCHED_ACTION, DEFAULT_MERGE_MATCHED_ACTION).toString(); + } + + public String getNotMatchedAction() { + return configsMap.getOrDefault(MERGE_NOT_MATCHED_ACTION, DEFAULT_MERGE_NOT_MATCHED_ACTION).toString(); + } + + public String getUpdateColumn() { + return configsMap.getOrDefault(UPDATE_COLUMN, DEFAULT_UPDATE_COLUMN).toString(); + } + + public String getWhereConditionColumn() { + return configsMap.getOrDefault(WHERE_CONDITION_COLUMN, DEFAULT_WHERE_CONDITION_COLUMN).toString(); + } + + public double getRatioRecordsChange() { + return Double.valueOf(configsMap.getOrDefault(RATIO_RECORDS_CHANGE, DEFAULT_RATIO_RECORDS_CHANGE).toString()); + } + public Map getOtherConfigs() { if (configsMap == null) { return new HashMap<>(); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java new file mode 100644 index 0000000000000..9a369bcbd7dd9 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; +import org.apache.hudi.integ.testsuite.schema.SchemaUtils; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.ReduceFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$; +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder; +import org.apache.spark.sql.catalyst.encoders.RowEncoder; +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; + +import scala.Tuple2; +import scala.collection.JavaConversions; +import scala.collection.JavaConverters; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * This node validates that the contents from the input path are intact in Hudi. By default, no configs are required for this node. But there is an + * optional config "delete_input_data" that you can set for this node. If set, once validation completes, contents from inputPath are deleted. This will come in handy for long-running test suites. + * The README has more details under the docker setup section on usages of this node. + */ +public abstract class BaseValidateDatasetNode extends DagNode { + + public BaseValidateDatasetNode(DeltaConfig.Config config) { + this.config = config; + } + + /** + * @return {@link Logger} instance to use. + */ + public abstract Logger getLogger(); + + /** + * @param session {@link SparkSession} instance to use. + * @param context {@link ExecutionContext} instance to use. + * @param inputSchema input schema in {@link StructType} + * @return data in {@link Dataset} to validate. + */ + public abstract Dataset getDatasetToValidate(SparkSession session, ExecutionContext context, + StructType inputSchema); + + @Override + public void execute(ExecutionContext context, int curItrCount) throws Exception { + + SparkSession session = SparkSession.builder().sparkContext(context.getJsc().sc()).getOrCreate(); + // todo: Fix partitioning schemes. For now, assumes data based partitioning. 
+ String inputPath = context.getHoodieTestSuiteWriter().getCfg().inputBasePath + "/*/*"; + log.warn("Validation using data from input path " + inputPath); + // listing batches to be validated + String inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; + if (log.isDebugEnabled()) { + FileSystem fs = new Path(inputPathStr) + .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); + FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); + log.info("fileStatuses length: " + fileStatuses.length); + for (FileStatus fileStatus : fileStatuses) { + log.debug("Listing all Micro batches to be validated :: " + fileStatus.getPath().toString()); + } + } + + Dataset inputSnapshotDf = getInputDf(context, session, inputPath); + + // read from hudi and remove meta columns. + Dataset trimmedHudiDf = getDatasetToValidate(session, context, inputSnapshotDf.schema()); + Dataset intersectionDf = inputSnapshotDf.intersect(trimmedHudiDf); + long inputCount = inputSnapshotDf.count(); + long outputCount = trimmedHudiDf.count(); + log.debug("Input count: " + inputCount + "; output count: " + outputCount); + // the intersected df should be same as inputDf. if not, there is some mismatch. + if (outputCount == 0 || inputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) { + log.error("Data set validation failed. Total count in hudi " + outputCount + ", input df count " + inputCount); + throw new AssertionError("Hudi contents does not match contents input data. "); + } + + if (config.isValidateHive()) { + String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key()); + String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key()); + log.warn("Validating hive table with db : " + database + " and table : " + tableName); + Dataset cowDf = session.sql("SELECT * FROM " + database + "." + tableName); + Dataset trimmedCowDf = cowDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) + .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); + intersectionDf = inputSnapshotDf.intersect(trimmedCowDf); + outputCount = trimmedHudiDf.count(); + log.warn("Input count: " + inputCount + "; output count: " + outputCount); + // the intersected df should be same as inputDf. if not, there is some mismatch. + if (outputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) { + log.error("Data set validation failed for COW hive table. Total count in hudi " + outputCount + ", input df count " + inputCount); + throw new AssertionError("Hudi hive table contents does not match contents input data. "); + } + } + + // if delete input data is enabled, erase input data. + if (config.isDeleteInputData()) { + // clean up input data for current group of writes. 
+ inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; + FileSystem fs = new Path(inputPathStr) + .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); + FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); + for (FileStatus fileStatus : fileStatuses) { + log.debug("Micro batch to be deleted " + fileStatus.getPath().toString()); + fs.delete(fileStatus.getPath(), true); + } + } + } + + private Dataset getInputDf(ExecutionContext context, SparkSession session, String inputPath) { + String recordKeyField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.RECORDKEY_FIELD().key()); + String partitionPathField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.PARTITIONPATH_FIELD().key()); + // todo: fix hard coded fields from configs. + // read input and resolve insert, updates, etc. + Dataset inputDf = session.read().format("avro").load(inputPath); + ExpressionEncoder encoder = getEncoder(inputDf.schema()); + return inputDf.groupByKey( + (MapFunction) value -> + value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField), Encoders.STRING()) + .reduceGroups((ReduceFunction) (v1, v2) -> { + int ts1 = v1.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); + int ts2 = v2.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); + if (ts1 > ts2) { + return v1; + } else { + return v2; + } + }) + .map((MapFunction, Row>) value -> value._2, encoder) + .filter("_hoodie_is_deleted != true"); + } + + + private ExpressionEncoder getEncoder(StructType schema) { + List attributes = JavaConversions.asJavaCollection(schema.toAttributes()).stream() + .map(Attribute::toAttribute).collect(Collectors.toList()); + return RowEncoder.apply(schema) + .resolveAndBind(JavaConverters.asScalaBufferConverter(attributes).asScala().toSeq(), + SimpleAnalyzer$.MODULE$); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CompactNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CompactNode.java index 61306d12bcee3..dd7d880f6aef6 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CompactNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CompactNode.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + import org.apache.spark.api.java.JavaRDD; /** @@ -40,8 +41,8 @@ public CompactNode(Config config) { * if it has one. * * @param executionContext Execution context to run this compaction - * @param curItrCount cur interation count. - * @throws Exception will be thrown if any error occurred. + * @param curItrCount cur iteration count. + * @throws Exception will be thrown if any error occurred. 
*/ @Override public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { @@ -53,7 +54,7 @@ public void execute(ExecutionContext executionContext, int curItrCount) throws E if (lastInstant.isPresent()) { log.info("Compacting instant {}", lastInstant.get()); this.result = executionContext.getHoodieTestSuiteWriter().compact(Option.of(lastInstant.get().getTimestamp())); + executionContext.getHoodieTestSuiteWriter().commitCompaction(result, executionContext.getJsc().emptyRDD(), Option.of(lastInstant.get().getTimestamp())); } } - } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java index 5ca98ccf62ff6..f5cf56b99c3ce 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java @@ -60,6 +60,7 @@ protected void generate(DeltaGenerator deltaGenerator) throws Exception { if (!config.isDisableGenerate()) { log.info("Generating input data for node {}", this.getName()); this.deltaWriteStatsRDD = deltaGenerator.writeRecords(deltaGenerator.generateInserts(config)); + this.deltaWriteStatsRDD.cache(); this.deltaWriteStatsRDD.count(); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java index 62bf9b09a5793..0297bc70384f0 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java @@ -56,5 +56,4 @@ public void execute(ExecutionContext executionContext, int curItrCount) throws E this.result = scheduledInstant; } } - } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java index 09e27c257e25e..03b37a9fc2b39 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java @@ -18,133 +18,39 @@ package org.apache.hudi.integ.testsuite.dag.nodes; -import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; -import org.apache.hudi.integ.testsuite.schema.SchemaUtils; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.ReduceFunction; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$; -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder; -import org.apache.spark.sql.catalyst.encoders.RowEncoder; -import org.apache.spark.sql.catalyst.expressions.Attribute; import org.apache.spark.sql.types.StructType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.List; -import 
java.util.stream.Collectors; - -import scala.Tuple2; -import scala.collection.JavaConversions; -import scala.collection.JavaConverters; - /** - * This nodes validates contents from input path are in tact with Hudi. This nodes uses spark datasource for comparison purposes. By default no configs are required for this node. But there is an - * optional config "delete_input_data" that you can set for this node. If set, once validation completes, contents from inputPath are deleted. This will come in handy for long running test suites. - * README has more details under docker set up for usages of this node. + * This validation node uses spark datasource for comparison purposes. */ -public class ValidateDatasetNode extends DagNode { +public class ValidateDatasetNode extends BaseValidateDatasetNode { private static Logger log = LoggerFactory.getLogger(ValidateDatasetNode.class); public ValidateDatasetNode(Config config) { - this.config = config; + super(config); } @Override - public void execute(ExecutionContext context, int curItrCount) throws Exception { - - SparkSession session = SparkSession.builder().sparkContext(context.getJsc().sc()).getOrCreate(); + public Logger getLogger() { + return log; + } - // todo: Fix partitioning schemes. For now, assumes data based partitioning. - String inputPath = context.getHoodieTestSuiteWriter().getCfg().inputBasePath + "/*/*"; + @Override + public Dataset getDatasetToValidate(SparkSession session, ExecutionContext context, + StructType inputSchema) { String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*"; - log.warn("ValidateDataset Node: Input path " + inputPath + ", hudi path " + hudiPath); - // listing batches to be validated - String inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; - FileSystem fs = new Path(inputPathStr) - .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); - FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); - for (FileStatus fileStatus : fileStatuses) { - log.debug("Listing all Micro batches to be validated :: " + fileStatus.getPath().toString()); - } - - String recordKeyField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.RECORDKEY_FIELD().key()); - String partitionPathField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.PARTITIONPATH_FIELD().key()); - // todo: fix hard coded fields from configs. - // read input and resolve insert, updates, etc. - Dataset inputDf = session.read().format("avro").load(inputPath); - ExpressionEncoder encoder = getEncoder(inputDf.schema()); - Dataset inputSnapshotDf = inputDf.groupByKey( - (MapFunction) value -> partitionPathField + "+" + recordKeyField, Encoders.STRING()) - .reduceGroups((ReduceFunction) (v1, v2) -> { - int ts1 = v1.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); - int ts2 = v2.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); - if (ts1 > ts2) { - return v1; - } else { - return v2; - } - }) - .map((MapFunction, Row>) value -> value._2, encoder) - .filter("_hoodie_is_deleted is NULL"); - - // read from hudi and remove meta columns. 
+ log.info("Validate data in target hudi path " + hudiPath); Dataset hudiDf = session.read().format("hudi").load(hudiPath); - Dataset trimmedDf = hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) - .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); - - Dataset intersectionDf = inputSnapshotDf.intersect(trimmedDf); - // the intersected df should be same as inputDf. if not, there is some mismatch. - if (inputSnapshotDf.except(intersectionDf).count() != 0) { - log.error("Data set validation failed. Total count in hudi " + trimmedDf.count() + ", input df count " + inputSnapshotDf.count()); - throw new AssertionError("Hudi contents does not match contents input data. "); - } - - if (config.isValidateHive()) { - String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key()); - String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key()); - log.warn("Validating hive table with db : " + database + " and table : " + tableName); - Dataset cowDf = session.sql("SELECT * FROM " + database + "." + tableName); - Dataset trimmedCowDf = cowDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) - .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); - intersectionDf = inputSnapshotDf.intersect(trimmedDf); - // the intersected df should be same as inputDf. if not, there is some mismatch. - if (inputSnapshotDf.except(intersectionDf).count() != 0) { - log.error("Data set validation failed for COW hive table. Total count in hudi " + trimmedCowDf.count() + ", input df count " + inputSnapshotDf.count()); - throw new AssertionError("Hudi hive table contents does not match contents input data. "); - } - } - - // if delete input data is enabled, erase input data. - if (config.isDeleteInputData()) { - // clean up input data for current group of writes. 
- inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; - fs = new Path(inputPathStr) - .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); - fileStatuses = fs.listStatus(new Path(inputPathStr)); - for (FileStatus fileStatus : fileStatuses) { - log.debug("Micro batch to be deleted " + fileStatus.getPath().toString()); - fs.delete(fileStatus.getPath(), true); - } - } - } - - private ExpressionEncoder getEncoder(StructType schema) { - List attributes = JavaConversions.asJavaCollection(schema.toAttributes()).stream() - .map(Attribute::toAttribute).collect(Collectors.toList()); - return RowEncoder.apply(schema) - .resolveAndBind(JavaConverters.asScalaBufferConverter(attributes).asScala().toSeq(), - SimpleAnalyzer$.MODULE$); + return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) + .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java index e004b3beac9ac..6d5bc4ffedeca 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java @@ -44,6 +44,7 @@ import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader; import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader; import org.apache.hudi.integ.testsuite.reader.DeltaInputReader; +import org.apache.hudi.integ.testsuite.schema.SchemaUtils; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter; @@ -75,7 +76,7 @@ public class DeltaGenerator implements Serializable { private int batchId; public DeltaGenerator(DFSDeltaConfig deltaOutputConfig, JavaSparkContext jsc, SparkSession sparkSession, - String schemaStr, BuiltinKeyGenerator keyGenerator) { + String schemaStr, BuiltinKeyGenerator keyGenerator) { this.deltaOutputConfig = deltaOutputConfig; this.jsc = jsc; this.sparkSession = sparkSession; @@ -123,7 +124,11 @@ public JavaRDD generateInserts(Config operation) { .mapPartitionsWithIndex((index, p) -> { return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition, minPayloadSize, schemaStr, partitionPathFieldNames, numPartitions, startPartition)); - }, true); + }, true) + .map(record -> { + record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId); + return record; + }); if (deltaOutputConfig.getInputParallelism() < numPartitions) { inputBatch = inputBatch.coalesce(deltaOutputConfig.getInputParallelism()); @@ -167,7 +172,11 @@ public JavaRDD generateUpdates(Config config) throws IOException log.info("Repartitioning records done for updates"); UpdateConverter converter = new UpdateConverter(schemaStr, config.getRecordSize(), partitionPathFieldNames, recordRowKeyFieldNames); - JavaRDD updates = converter.convert(adjustedRDD); + JavaRDD convertedRecords = converter.convert(adjustedRDD); + JavaRDD updates = convertedRecords.map(record -> { + record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId); + return record; + }); updates.persist(StorageLevel.DISK_ONLY()); if (inserts == null) { inserts = updates; @@ -205,11 +214,16 @@ public JavaRDD 
generateDeletes(Config config) throws IOException .getNumRecordsDelete()); } } + log.info("Repartitioning records for delete"); // persist this since we will make multiple passes over this adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism()); Converter converter = new DeleteConverter(schemaStr, config.getRecordSize()); - JavaRDD deletes = converter.convert(adjustedRDD); + JavaRDD convertedRecords = converter.convert(adjustedRDD); + JavaRDD deletes = convertedRecords.map(record -> { + record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId); + return record; + }); deletes.persist(StorageLevel.DISK_ONLY()); return deletes; } else { diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java index 1fd3d2f966cd7..2648740f54e0f 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java @@ -88,7 +88,7 @@ protected List getPartitions(Option partitionsLimit) throws IOE // calls in metrics as they are not part of normal HUDI operation. HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); List partitionPaths = FSUtils.getAllPartitionPaths(engineContext, metaClient.getBasePath(), - HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(), false); + HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false); // Sort partition so we can pick last N partitions by default Collections.sort(partitionPaths); if (!partitionPaths.isEmpty()) { diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkBulkInsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkBulkInsertNode.scala index 4d17570fc8d6a..6654264a969c9 100644 --- a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkBulkInsertNode.scala +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkBulkInsertNode.scala @@ -30,11 +30,12 @@ import scala.collection.JavaConverters._ /** * Spark datasource based bulk insert node - * @param config1 + * + * @param dagNodeConfig DAG node configurations. */ -class SparkBulkInsertNode(config1: Config) extends DagNode[RDD[WriteStatus]] { +class SparkBulkInsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { - config = config1 + config = dagNodeConfig /** * Execute the {@link DagNode}. 
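The Spark SQL DAG nodes introduced below consume, via SparkSqlUtils, the options added to DeltaConfig.Config earlier in this change. A minimal sketch of how those options resolve through the new getters; the option keys are the constants added above, while the values, the generic types, and the standalone wrapper class are assumptions for illustration:
```java
import java.util.HashMap;
import java.util.Map;

import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;

public class SparkSqlNodeConfigSketch {
  public static void main(String[] args) {
    // Keys mirror the constants added to DeltaConfig.Config; values are illustrative only.
    Map<String, Object> configsMap = new HashMap<>();
    configsMap.put("primary_key", "_row_key");
    configsMap.put("pre_combine_field", "test_suite_source_ordering_field");
    configsMap.put("ratio_records_change", 0.2);

    DeltaConfig.Config config = new DeltaConfig.Config(configsMap);

    // Keys that are absent resolve to Option.empty() ...
    Option<String> tableType = config.getTableType();     // Option.empty()
    // ... while getters with defaults fall back to the documented values.
    String mergeCondition = config.getMergeCondition();   // "target._row_key = source._row_key"
    String updateColumn = config.getUpdateColumn();       // "fare"
    double changeRatio = config.getRatioRecordsChange();  // 0.2

    System.out.println(tableType + ", " + mergeCondition + ", " + updateColumn + ", " + changeRatio);
  }
}
```
The same map is what a dag node's config section is deserialized into, so, presumably, these keys can be set directly under a node's config in the workload yaml.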
diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeleteNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeleteNode.scala index 4ebd59d8f1710..645787a873e04 100644 --- a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeleteNode.scala +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeleteNode.scala @@ -19,6 +19,7 @@ package org.apache.hudi.integ.testsuite.dag.nodes import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord import org.apache.hudi.client.WriteStatus import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config @@ -32,12 +33,13 @@ import scala.collection.JavaConverters._ /** * Spark datasource based upsert node - * @param config1 + * + * @param dagNodeConfig DAG node configurations. */ -class SparkDeleteNode(config1: Config) extends DagNode[RDD[WriteStatus]] { +class SparkDeleteNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { private val log = LogManager.getLogger(getClass) - config = config1 + config = dagNodeConfig /** * Execute the {@link DagNode}. @@ -47,20 +49,9 @@ class SparkDeleteNode(config1: Config) extends DagNode[RDD[WriteStatus]] { * @throws Exception Thrown if the execution failed. */ override def execute(context: ExecutionContext, curItrCount: Int): Unit = { - if (!config.isDisableGenerate) { - println("Generating input data for node {}", this.getName) - context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateDeletes(config)).count() - } - // Deletes can't be fetched using getNextBatch() bcoz, getInsert(schema) from payload will return empty for delete // records - context.getWriterContext.getHoodieTestSuiteWriter.getNextBatchForDeletes() - val pathToRead = context.getWriterContext.getCfg.inputBasePath + "/" + context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse("") - - val avroDf = context.getWriterContext.getSparkSession.read.format("avro").load(pathToRead) - val genRecsRDD = HoodieSparkUtils.createRdd(avroDf, "testStructName","testNamespace", false, - org.apache.hudi.common.util.Option.of(new Schema.Parser().parse(context.getWriterContext.getHoodieTestSuiteWriter.getSchema))) - + val genRecsRDD = generateRecordsForDelete(config, context) val inputDF = AvroConversionUtils.createDataFrame(genRecsRDD, context.getWriterContext.getHoodieTestSuiteWriter.getSchema, context.getWriterContext.getSparkSession) @@ -75,4 +66,24 @@ class SparkDeleteNode(config1: Config) extends DagNode[RDD[WriteStatus]] { .mode(SaveMode.Append) .save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath) } + + /** + * Generates records for delete operations in Spark. + * + * @param config Node configs. + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. 
+ */ + private def generateRecordsForDelete(config: Config, context: ExecutionContext): RDD[GenericRecord] = { + if (!config.isDisableGenerate) { + context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateDeletes(config)).count() + } + + context.getWriterContext.getHoodieTestSuiteWriter.getNextBatchForDeletes() + val pathToRead = context.getWriterContext.getCfg.inputBasePath + "/" + context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse("") + + val avroDf = context.getWriterContext.getSparkSession.read.format("avro").load(pathToRead) + HoodieSparkUtils.createRdd(avroDf, "testStructName", "testNamespace", false, + org.apache.hudi.common.util.Option.of(new Schema.Parser().parse(context.getWriterContext.getHoodieTestSuiteWriter.getSchema))) + } } diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala index f962e41bd409f..1b69cf8faf494 100644 --- a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala @@ -30,11 +30,12 @@ import scala.collection.JavaConverters._ /** * Spark datasource based insert node - * @param config1 + * + * @param dagNodeConfig DAG node configurations. */ -class SparkInsertNode(config1: Config) extends DagNode[RDD[WriteStatus]] { +class SparkInsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { - config = config1 + config = dagNodeConfig /** * Execute the {@link DagNode}. diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala index 6486fede1d2da..858827a7b2c47 100644 --- a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala @@ -30,11 +30,12 @@ import scala.collection.JavaConverters._ /** * Spark datasource based upsert node - * @param config1 + * + * @param dagNodeConfig DAG node configurations. */ -class SparkUpsertNode(config1: Config) extends DagNode[RDD[WriteStatus]] { +class SparkUpsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { - config = config1 + config = dagNodeConfig /** * Execute the {@link DagNode}. diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/BaseSparkSqlNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/BaseSparkSqlNode.scala new file mode 100644 index 0000000000000..ce6a40efbced0 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/BaseSparkSqlNode.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.AvroConversionUtils +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD +import org.slf4j.{Logger, LoggerFactory} + +/** + * Abstract class for DAG node of running Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +abstract class BaseSparkSqlNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { + + val LOG: Logger = LoggerFactory.getLogger(this.getClass) + val TEMP_TABLE_NAME = "_spark_sql_temp_table" + config = dagNodeConfig + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. + */ + def queryToRun(config: Config, context: ExecutionContext): String + + /** + * Prepares the data for the Spark write operation. + * + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. + */ + def prepareData(context: ExecutionContext): RDD[GenericRecord] = { + if (!config.isDisableGenerate) { + context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateInserts(config)).count() + } + context.getWriterContext.getHoodieTestSuiteWriter.getNextBatch + } + + /** + * @return Name of the temp table containing the input data. + */ + def getTempTableName(): String = { + TEMP_TABLE_NAME + } + + /** + * Execute the {@link DagNode}. + * + * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. + * @throws Exception Thrown if the execution failed. + */ + override def execute(context: ExecutionContext, curItrCount: Int): Unit = { + LOG.info("Run query in Spark SQL ...") + val nextBatch = prepareData(context) + val sparkSession = context.getWriterContext.getSparkSession + val inputDF = AvroConversionUtils.createDataFrame(nextBatch, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + sparkSession) + inputDF.createOrReplaceTempView(TEMP_TABLE_NAME) + + val query = queryToRun(config, context) + SparkSqlUtils.logQuery(LOG, query) + sparkSession.sql(query) + LOG.info("Finish run query in Spark SQL.") + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlCreateTableNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlCreateTableNode.scala new file mode 100644 index 0000000000000..3db6aa2ccf557 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlCreateTableNode.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.hadoop.fs.Path +import org.apache.hudi.AvroConversionUtils +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD +import org.slf4j.{Logger, LoggerFactory} + +/** + * DAG node of create table using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlCreateTableNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { + + val LOG: Logger = LoggerFactory.getLogger(classOf[SparkSqlCreateTableNode]) + val TEMP_TABLE_NAME: String = "_spark_sql_temp_table" + + config = dagNodeConfig + + /** + * Execute the {@link DagNode}. + * + * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. + * @throws Exception Thrown if the execution failed. 
+ */ + override def execute(context: ExecutionContext, curItrCount: Int): Unit = { + LOG.info("Creating table in Spark SQL ...") + val sparkSession = context.getWriterContext.getSparkSession + val targetTableName = context.getWriterContext.getCfg.targetTableName + val targetBasePath = context.getWriterContext.getCfg.targetBasePath + "_sql" + + if (config.shouldUseCtas) { + // Prepares data for CTAS query + if (!config.isDisableGenerate) { + context.getDeltaGenerator.writeRecords(context.getDeltaGenerator.generateInserts(config)).count() + } + val nextBatch = context.getWriterContext.getHoodieTestSuiteWriter.getNextBatch + val sparkSession = context.getWriterContext.getSparkSession + val inputDF = AvroConversionUtils.createDataFrame(nextBatch, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + sparkSession) + inputDF.createOrReplaceTempView(TEMP_TABLE_NAME) + } + + // Cleans up the table + sparkSession.sql("drop table if exists " + targetTableName) + if (config.isTableExternal) { + LOG.info("Clean up " + targetBasePath) + val fs = FSUtils.getFs(targetBasePath, context.getJsc.hadoopConfiguration()) + val targetPath = new Path(targetBasePath) + if (fs.exists(targetPath)) { + fs.delete(targetPath, true) + } + } + + // Executes the create table query + val createTableQuery = SparkSqlUtils.constructCreateTableQuery( + config, targetTableName, targetBasePath, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, TEMP_TABLE_NAME) + SparkSqlUtils.logQuery(LOG, createTableQuery) + sparkSession.sql(createTableQuery) + val targetTableCount = sparkSession.sql("select * from " + targetTableName) + LOG.info("Target table count: " + targetTableCount.count()) + LOG.info("Finish create table in Spark SQL.") + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlDeleteNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlDeleteNode.scala new file mode 100644 index 0000000000000..847381f8cc588 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlDeleteNode.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD + +/** + * DAG node of delete using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. 
+ */ +class SparkSqlDeleteNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Prepares the data for the write operation. + * + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. + */ + override def prepareData(context: ExecutionContext): RDD[GenericRecord] = { + val sparkSession = context.getWriterContext.getSparkSession + val recordsToDelete = SparkSqlUtils.generateDeleteRecords( + config, sparkSession, context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + context.getWriterContext.getCfg.targetTableName, sparkSession.sparkContext.defaultParallelism) + LOG.info("Number of records to delete: " + recordsToDelete.count()) + // The update records corresponding to the SQL are only used for data validation + context.getDeltaGenerator().writeRecords(recordsToDelete).count() + recordsToDelete + } + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. + */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + SparkSqlUtils.constructDeleteQuery(config, context.getWriterContext.getSparkSession, + context.getWriterContext.getCfg.targetTableName) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertNode.scala new file mode 100644 index 0000000000000..6fc79f4ab33dc --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertNode.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils + +/** + * DAG node of insert using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlInsertNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. 
+ */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + val targetTableName = context.getWriterContext.getCfg.targetTableName + SparkSqlUtils.constructInsertQuery( + "into", targetTableName, + SparkSqlUtils.getTableSchema(context.getWriterContext.getSparkSession, targetTableName), + getTempTableName()) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertOverwriteNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertOverwriteNode.scala new file mode 100644 index 0000000000000..248b70d545e5f --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertOverwriteNode.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils + +/** + * DAG node of insert overwrite using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlInsertOverwriteNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. + */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + val targetTableName = context.getWriterContext.getCfg.targetTableName + SparkSqlUtils.constructInsertQuery( + "overwrite", targetTableName, + SparkSqlUtils.getTableSchema(context.getWriterContext.getSparkSession, targetTableName), + getTempTableName()) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlMergeNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlMergeNode.scala new file mode 100644 index 0000000000000..b03230beb4cbc --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlMergeNode.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD + +/** + * DAG node of merge using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlMergeNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Prepares the data for the Spark write operation. + * + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. + */ + override def prepareData(context: ExecutionContext): RDD[GenericRecord] = { + if (!config.isDisableGenerate) { + context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateUpdates(config)).count() + } + context.getWriterContext.getHoodieTestSuiteWriter.getNextBatch + } + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. + */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + val targetTableName = context.getWriterContext.getCfg.targetTableName + SparkSqlUtils.constructMergeQuery( + config, targetTableName, + SparkSqlUtils.getTableSchema(context.getWriterContext.getSparkSession, targetTableName), + getTempTableName()) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlUpdateNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlUpdateNode.scala new file mode 100644 index 0000000000000..fdc799feaddec --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlUpdateNode.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD + +/** + * DAG node of update using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlUpdateNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Prepares the data for the Spark write operation. + * + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. + */ + override def prepareData(context: ExecutionContext): RDD[GenericRecord] = { + val sparkSession = context.getWriterContext.getSparkSession + val recordsToUpdate = SparkSqlUtils.generateUpdateRecords( + config, sparkSession, context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + context.getWriterContext.getCfg.targetTableName, sparkSession.sparkContext.defaultParallelism) + LOG.info("Number of records to update: " + recordsToUpdate.count()) + // The update records corresponding to the SQL are only used for data validation + context.getDeltaGenerator().writeRecords(recordsToUpdate).count() + recordsToUpdate + } + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. + */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + SparkSqlUtils.constructUpdateQuery(config, context.getWriterContext.getSparkSession, + context.getWriterContext.getCfg.targetTableName) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlValidateDatasetNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlValidateDatasetNode.scala new file mode 100644 index 0000000000000..01804baa9f148 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlValidateDatasetNode.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.BaseValidateDatasetNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Dataset, Row, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +/** + * This validation node uses Spark SQL to get data for comparison purposes. + */ +class SparkSqlValidateDatasetNode(dagNodeConfig: Config) extends BaseValidateDatasetNode(dagNodeConfig) { + + val LOG: Logger = LoggerFactory.getLogger(classOf[SparkSqlValidateDatasetNode]) + + config = dagNodeConfig + + /** + * @return {@link Logger} instance to use. + */ + override def getLogger: Logger = LOG + + /** + * @param session {@link SparkSession} instance to use. + * @param context {@link ExecutionContext} instance to use. + * @param inputSchema input schema in {@link StructType} + * @return data in {@link Dataset< Row >} to validate. + */ + override def getDatasetToValidate(session: SparkSession, context: ExecutionContext, + inputSchema: StructType): Dataset[Row] = { + val tableName = context.getWriterContext.getCfg.targetTableName + LOG.info("Validate data in table " + tableName) + val sortedInputFieldNames = inputSchema.fieldNames.sorted + val tableSchema = session.table(tableName).schema + val sortedTableFieldNames = tableSchema.fieldNames + .filter(field => !HoodieRecord.HOODIE_META_COLUMNS.contains(field)).sorted + if (!(sortedInputFieldNames sameElements sortedTableFieldNames)) { + LOG.error("Input schema: ") + inputSchema.printTreeString() + LOG.error("Table schema: ") + tableSchema.printTreeString() + throw new AssertionError("Data set validation failed. The schema does not match.") + } + session.sql(SparkSqlUtils.constructSelectQuery(inputSchema, tableName)) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/utils/SparkSqlUtils.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/utils/SparkSqlUtils.scala new file mode 100644 index 0000000000000..fa16eae06b17e --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/utils/SparkSqlUtils.scala @@ -0,0 +1,526 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.utils + +import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.HoodieSparkUtils +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.util.Option +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils.getFieldNamesAndTypes +import org.apache.hudi.utilities.schema.RowBasedSchemaProvider +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.sql.avro.SchemaConverters +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.storage.StorageLevel +import org.slf4j.Logger + +import scala.math.BigDecimal.RoundingMode.RoundingMode + +/** + * Utils for test nodes in Spark SQL + */ +object SparkSqlUtils { + + /** + * @param sparkSession spark session to use + * @param tableName table name + * @return table schema excluding meta columns in `StructType` + */ + def getTableSchema(sparkSession: SparkSession, tableName: String): StructType = { + new StructType(sparkSession.table(tableName).schema.fields + .filter(field => !HoodieRecord.HOODIE_META_COLUMNS.contains(field.name))) + } + + /** + * Converts Avro schema in String to the SQL schema expression, with partition fields at the end + * + * For example, given the Avro schema below: + * """ + * {"type":"record","name":"triprec","fields":[{"name":"timestamp","type":"long"}, + * {"name":"_row_key","type":"string"},{"name":"rider","type":"string"},{"name":"driver","type":"string"}, + * {"name":"begin_lat","type":"double"},{"name":"begin_lon","type":"double"},{"name":"end_lat","type":"double"}, + * {"name":"end_lon","type":"double"},{"name":"fare","type":"double"}, + * {"name":"_hoodie_is_deleted","type":"boolean","default":false}]} + * """ + * and the partition columns Set("rider"), + * the SQL schema expression is: + * """ + * timestamp bigint, + * _row_key string, + * driver string, + * begin_lat double, + * begin_lon double, + * end_lat double, + * end_lon double, + * fare double, + * _hoodie_is_deleted boolean, + * rider string + * """ + * + * @param avroSchemaString Avro schema String + * @param partitionColumns partition columns + * @return corresponding SQL schema expression + */ + def convertAvroToSqlSchemaExpression(avroSchemaString: String, partitionColumns: Set[String]): String = { + val fields: Array[(String, String)] = getFieldNamesAndTypes(avroSchemaString) + val reorderedFields = fields.filter(field => !partitionColumns.contains(field._1)) ++ + fields.filter(field => partitionColumns.contains(field._1)) + reorderedFields.map(e => e._1 + " " + e._2).mkString(",\n") + } + + /** + * Converts Avro schema in String to an array of field names. 
+ *
+ * For example, given the Avro schema below:
+ * """
+ * {"type":"record","name":"triprec","fields":[{"name":"timestamp","type":"long"},
+ * {"name":"_row_key","type":"string"},{"name":"rider","type":"string"},{"name":"driver","type":"string"},
+ * {"name":"begin_lat","type":"double"},{"name":"begin_lon","type":"double"},{"name":"end_lat","type":"double"},
+ * {"name":"end_lon","type":"double"},{"name":"fare","type":"double"},
+ * {"name":"_hoodie_is_deleted","type":"boolean","default":false}]}
+ * """
+ * the output is
+ * ["timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon",
+ * "fare", "_hoodie_is_deleted"]
+ *
+ * @param avroSchemaString Avro schema String
+ * @return an array of field names.
+ */
+ def convertAvroToFieldNames(avroSchemaString: String): Array[String] = {
+ getFieldNamesAndTypes(avroSchemaString).map(e => e._1)
+ }
+
+ /**
+ * Gets an array of field names and types from Avro schema String.
+ *
+ * For example, given the Avro schema below:
+ * """
+ * {"type":"record","name":"triprec","fields":[{"name":"timestamp","type":"long"},
+ * {"name":"_row_key","type":"string"},{"name":"rider","type":"string"},{"name":"driver","type":"string"},
+ * {"name":"begin_lat","type":"double"},{"name":"begin_lon","type":"double"},{"name":"end_lat","type":"double"},
+ * {"name":"end_lon","type":"double"},{"name":"fare","type":"double"},
+ * {"name":"_hoodie_is_deleted","type":"boolean","default":false}]}
+ * """
+ * the output is
+ * [("timestamp", "bigint"),
+ * ("_row_key", "string"),
+ * ("rider", "string"),
+ * ("driver", "string"),
+ * ("begin_lat", "double"),
+ * ("begin_lon", "double"),
+ * ("end_lat", "double"),
+ * ("end_lon", "double"),
+ * ("fare", "double"),
+ * ("_hoodie_is_deleted", "boolean")]
+ *
+ * @param avroSchemaString Avro schema String
+ * @return an array of field names and types
+ */
+ def getFieldNamesAndTypes(avroSchemaString: String): Array[(String, String)] = {
+ val schema = new Schema.Parser().parse(avroSchemaString)
+ val structType = SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType]
+ structType.fields.map(field => (field.name, field.dataType.simpleString))
+ }
+
+ /**
+ * Logs the Spark SQL query to run.
+ *
+ * @param log {@link Logger} instance to use.
+ * @param query query String.
+ */
+ def logQuery(log: Logger, query: String): Unit = {
+ log.warn("----- Running the following Spark SQL query -----")
+ log.warn(query)
+ log.warn("-" * 50)
+ }
+
+ /**
+ * Constructs the select query.
+ *
+ * For example, given the Avro schema below:
+ * """
+ * {"type":"record","name":"triprec","fields":[{"name":"timestamp","type":"long"},
+ * {"name":"_row_key","type":"string"},{"name":"rider","type":"string"},{"name":"driver","type":"string"},
+ * {"name":"begin_lat","type":"double"},{"name":"begin_lon","type":"double"},{"name":"end_lat","type":"double"},
+ * {"name":"end_lon","type":"double"},{"name":"fare","type":"double"},
+ * {"name":"_hoodie_is_deleted","type":"boolean","default":false}]}
+ * """
+ * and the partition columns Set("rider"),
+ * the output is
+ * """
+ * select timestamp, _row_key, driver, begin_lat, begin_lon, end_lat, end_lon, fare,
+ * _hoodie_is_deleted, rider from _temp_table
+ * """
+ *
+ * @param inputSchema input Avro schema String.
+ * @param partitionColumns partition columns
+ * @param tableName table name.
+ * @return select query String.
+ */
+ def constructSelectQuery(inputSchema: String, partitionColumns: Set[String], tableName: String): String = {
+ val fieldNames: Array[String] = SparkSqlUtils.convertAvroToFieldNames(inputSchema)
+ val reorderedFieldNames = fieldNames.filter(name => !partitionColumns.contains(name)) ++
+ fieldNames.filter(name => partitionColumns.contains(name))
+ constructSelectQuery(reorderedFieldNames, tableName)
+ }
+
+ /**
+ * Constructs the select query with {@link StructType} columns in the select.
+ *
+ * @param structType {@link StructType} instance.
+ * @param tableName table name.
+ * @return select query String.
+ */
+ def constructSelectQuery(structType: StructType, tableName: String): String = {
+ constructSelectQuery(structType, Set.empty[String], tableName)
+ }
+
+ /**
+ * Constructs the select query with {@link StructType} columns in the select and the partition
+ * columns at the end.
+ *
+ * @param structType {@link StructType} instance.
+ * @param partitionColumns partition columns in a {@link Set}
+ * @param tableName table name.
+ * @return select query String.
+ */
+ def constructSelectQuery(structType: StructType, partitionColumns: Set[String], tableName: String): String = {
+ val fieldNames: Array[String] = structType.fields.map(field => field.name)
+ val reorderedFieldNames = fieldNames.filter(name => !partitionColumns.contains(name)) ++
+ fieldNames.filter(name => partitionColumns.contains(name))
+ constructSelectQuery(reorderedFieldNames, tableName)
+ }
+
+ /**
+ * Constructs the select query with an {@link Array} of field names.
+ *
+ * @param fieldNames field names in String.
+ * @param tableName table name.
+ * @return select query String.
+ */
+ def constructSelectQuery(fieldNames: Array[String], tableName: String): String = {
+ val selectQueryBuilder = new StringBuilder("select ")
+ selectQueryBuilder.append(fieldNames.mkString(", "))
+ selectQueryBuilder.append(" from ")
+ selectQueryBuilder.append(tableName)
+ selectQueryBuilder.toString()
+ }
+
+ /**
+ * Constructs the Spark SQL create table query based on the configs.
+ *
+ * @param config DAG node configurations.
+ * @param targetTableName target table name.
+ * @param targetBasePath target base path for external table.
+ * @param inputSchema input Avro schema String.
+ * @param inputTableName name of the table containing input data.
+ * @return create table query.
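+ *
+ * For example (illustrative; the exact output depends on the node configs), with CTAS enabled,
+ * a managed table, table type "cow", primary key "_row_key", pre-combine field "timestamp" and
+ * partition field "rider", the generated query looks like:
+ * """
+ * create table target_table using hudi
+ * options (
+ * type = 'cow',
+ * primaryKey = '_row_key',
+ * preCombineField = 'timestamp'
+ * )
+ * partitioned by (rider)
+ * as
+ * select ... from _temp_table
+ * """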
+ */ + def constructCreateTableQuery(config: Config, targetTableName: String, targetBasePath: String, + inputSchema: String, inputTableName: String): String = { + // Constructs create table statement + val createTableQueryBuilder = new StringBuilder("create table ") + createTableQueryBuilder.append(targetTableName) + val partitionColumns: Set[String] = + if (config.getPartitionField.isPresent) Set(config.getPartitionField.get) else Set.empty + if (!config.shouldUseCtas) { + // Adds the schema statement if not using CTAS + createTableQueryBuilder.append(" (") + createTableQueryBuilder.append(SparkSqlUtils.convertAvroToSqlSchemaExpression(inputSchema, partitionColumns)) + createTableQueryBuilder.append("\n)") + } + createTableQueryBuilder.append(" using hudi") + val tableTypeOption = config.getTableType + val primaryKeyOption = config.getPrimaryKey + val preCombineFieldOption = config.getPreCombineField + + // Adds location for external table + if (config.isTableExternal) { + createTableQueryBuilder.append("\nlocation '" + targetBasePath + "'") + } + + // Adds options if set + var options = Array[String]() + if (tableTypeOption.isPresent) { + options :+= ("type = '" + tableTypeOption.get() + "'") + } + if (primaryKeyOption.isPresent) { + options :+= ("primaryKey = '" + primaryKeyOption.get() + "'") + } + if (preCombineFieldOption.isPresent) { + options :+= ("preCombineField = '" + preCombineFieldOption.get() + "'") + } + if (options.length > 0) { + createTableQueryBuilder.append(options.mkString("\noptions ( \n", ",\n", "\n)")) + } + + // Adds partition fields if set + val partitionFieldOption = config.getPartitionField + if (partitionFieldOption.isPresent) { + createTableQueryBuilder.append("\npartitioned by (" + partitionFieldOption.get() + ")") + } + + if (config.shouldUseCtas()) { + // Adds as select query + createTableQueryBuilder.append("\nas\n"); + createTableQueryBuilder.append(constructSelectQuery(inputSchema, partitionColumns, inputTableName)) + } + createTableQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL insert query based on the configs. + * + * @param insertType the insert type, in one of two types: "into" or "overwrite". + * @param targetTableName target table name. + * @param schema table schema to use + * @param inputTableName name of the table containing input data. + * @return insert query. + */ + def constructInsertQuery(insertType: String, targetTableName: String, schema: StructType, + inputTableName: String): String = { + // Constructs insert statement + val insertQueryBuilder = new StringBuilder("insert ") + insertQueryBuilder.append(insertType) + insertQueryBuilder.append(" ") + insertQueryBuilder.append(targetTableName) + insertQueryBuilder.append(" ") + insertQueryBuilder.append(constructSelectQuery(schema, inputTableName)) + insertQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL merge query based on the configs. + * + * @param config DAG node configurations. + * @param targetTableName target table name. + * @param schema table schema to use + * @param inputTableName name of the table containing input data. + * @return merge query. 
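+ *
+ * For example (illustrative; the condition and actions come from the node configs), with merge
+ * condition "target._row_key = source._row_key", matched action "update set *" and
+ * not-matched action "insert *", the generated query looks like:
+ * """
+ * merge into target_table as target using (
+ * select ... from _temp_table
+ * ) source
+ * on target._row_key = source._row_key
+ * when matched then update set *
+ * when not matched then insert *
+ * """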
+ */ + def constructMergeQuery(config: Config, targetTableName: String, schema: StructType, + inputTableName: String): String = { + val mergeQueryBuilder = new StringBuilder("merge into ") + mergeQueryBuilder.append(targetTableName) + mergeQueryBuilder.append(" as target using (\n") + mergeQueryBuilder.append(constructSelectQuery(schema, inputTableName)) + mergeQueryBuilder.append("\n) source\non ") + mergeQueryBuilder.append(config.getMergeCondition) + mergeQueryBuilder.append("\nwhen matched then ") + mergeQueryBuilder.append(config.getMatchedAction) + mergeQueryBuilder.append("\nwhen not matched then ") + mergeQueryBuilder.append(config.getNotMatchedAction) + mergeQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL update query based on the configs. + * + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param targetTableName target table name. + * @return update query. + */ + def constructUpdateQuery(config: Config, sparkSession: SparkSession, + targetTableName: String): String = { + val bounds = getLowerUpperBoundsFromPercentiles(config, sparkSession, targetTableName) + val updateQueryBuilder = new StringBuilder("update ") + updateQueryBuilder.append(targetTableName) + updateQueryBuilder.append(" set ") + updateQueryBuilder.append(config.getUpdateColumn) + updateQueryBuilder.append(" = ") + updateQueryBuilder.append(config.getUpdateColumn) + updateQueryBuilder.append(" * 1.6 ") + updateQueryBuilder.append(" where ") + updateQueryBuilder.append(config.getWhereConditionColumn) + updateQueryBuilder.append(" between ") + updateQueryBuilder.append(bounds._1) + updateQueryBuilder.append(" and ") + updateQueryBuilder.append(bounds._2) + updateQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL delete query based on the configs. + * + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param targetTableName target table name. + * @return delete query. + */ + def constructDeleteQuery(config: Config, sparkSession: SparkSession, + targetTableName: String): String = { + val bounds = getLowerUpperBoundsFromPercentiles(config, sparkSession, targetTableName) + val deleteQueryBuilder = new StringBuilder("delete from ") + deleteQueryBuilder.append(targetTableName) + deleteQueryBuilder.append(" where ") + deleteQueryBuilder.append(config.getWhereConditionColumn) + deleteQueryBuilder.append(" between ") + deleteQueryBuilder.append(bounds._1) + deleteQueryBuilder.append(" and ") + deleteQueryBuilder.append(bounds._2) + deleteQueryBuilder.toString() + } + + /** + * Generates the pair of percentile levels based on the ratio in the config. + * + * For example, given ratio as 0.4, the output is (0.3, 0.7). + * + * @param config DAG node configurations. + * @return the lower bound and upper bound percentiles. + */ + def generatePercentiles(config: Config): (Double, Double) = { + val ratio: Double = config.getRatioRecordsChange + (Math.max(0.5 - (ratio / 2.0), 0.0), Math.min(0.5 + (ratio / 2.0), 1.0)) + } + + /** + * @param number input double number + * @param mode rounding mode + * @return rounded double + */ + def roundDouble(number: Double, mode: RoundingMode): Double = { + BigDecimal(number).setScale(4, mode).toDouble + } + + /** + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param targetTableName target table name. + * @return lower and upper bound values based on the percentiles. 
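+ *
+ * For example (illustrative), with ratio_records_change 0.2 and where-condition column
+ * "begin_lat", the percentile levels are (0.4, 0.6), so the bounds come from running
+ * "select percentile(begin_lat, 0.4), percentile(begin_lat, 0.6) from target_table",
+ * which selects roughly 20% of the records for the subsequent update or delete.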
+ */ + def getLowerUpperBoundsFromPercentiles(config: Config, sparkSession: SparkSession, + targetTableName: String): (Double, Double) = { + val percentiles = generatePercentiles(config) + val result = sparkSession.sql(constructPercentileQuery(config, targetTableName, percentiles)).collect()(0) + (roundDouble(result.get(0).asInstanceOf[Double], BigDecimal.RoundingMode.HALF_DOWN), + roundDouble(result.get(1).asInstanceOf[Double], BigDecimal.RoundingMode.HALF_UP)) + } + + /** + * Constructs the query to get percentiles for the where condition. + * + * @param config DAG node configurations. + * @param targetTableName target table name. + * @param percentiles lower and upper percentiles. + * @return percentile query in String. + */ + def constructPercentileQuery(config: Config, targetTableName: String, + percentiles: (Double, Double)): String = { + val percentileQueryBuilder = new StringBuilder("select percentile(") + percentileQueryBuilder.append(config.getWhereConditionColumn) + percentileQueryBuilder.append(", ") + percentileQueryBuilder.append(percentiles._1) + percentileQueryBuilder.append("), percentile(") + percentileQueryBuilder.append(config.getWhereConditionColumn) + percentileQueryBuilder.append(", ") + percentileQueryBuilder.append(percentiles._2) + percentileQueryBuilder.append(") from ") + percentileQueryBuilder.append(targetTableName) + percentileQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL query to get update or delete records. + * + * @param config DAG node configurations. + * @param targetTableName target table name. + * @param avroSchemaString input Avro schema String. + * @param lowerBound lower bound value for the where condition. + * @param upperBound upper bound value for the where condition. + * @return delete query. + */ + def constructChangedRecordQuery(config: Config, targetTableName: String, avroSchemaString: String, + lowerBound: Double, upperBound: Double): String = { + val recordQueryBuilder = new StringBuilder(constructSelectQuery(avroSchemaString, Set.empty[String], targetTableName)) + recordQueryBuilder.append(" where ") + recordQueryBuilder.append(config.getWhereConditionColumn) + recordQueryBuilder.append(" between ") + recordQueryBuilder.append(lowerBound) + recordQueryBuilder.append(" and ") + recordQueryBuilder.append(upperBound) + recordQueryBuilder.toString() + } + + /** + * Generates the exact same records to update based on the SQL derived from the + * configs for data validation. + * + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param avroSchemaString input Avro schema String. + * @param targetTableName target table name. + * @param parallelism parallelism for RDD + * @return records in {@link JavaRdd[ GenericRecord ]}. 
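+ *
+ * Note that each generated record has its update column multiplied by 1.6, mirroring the
+ * "set <update_column> = <update_column> * 1.6" clause produced by constructUpdateQuery, so
+ * that the records written by the delta generator match the rows changed by the SQL and can
+ * be used for data validation.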
+ */ + def generateUpdateRecords(config: Config, sparkSession: SparkSession, avroSchemaString: String, + targetTableName: String, parallelism: Int): JavaRDD[GenericRecord] = { + val bounds = getLowerUpperBoundsFromPercentiles(config, sparkSession, targetTableName) + val rows = sparkSession.sql( + constructChangedRecordQuery(config, targetTableName, avroSchemaString, bounds._1, bounds._2)) + + val rdd = HoodieSparkUtils + .createRdd(rows, RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME, + RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE, reconcileToLatestSchema = false, Option.empty()) + .map(record => { + record.put(config.getUpdateColumn, record.get(config.getUpdateColumn).toString.toDouble * 1.6) + record + }) + .toJavaRDD() + val repartitionedRdd = rdd.repartition(parallelism) + repartitionedRdd.persist(StorageLevel.DISK_ONLY) + repartitionedRdd + } + + /** + * Generates the exact same records to delete based on the SQL derived from the + * configs for data validation. + * + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param avroSchemaString input Avro schema String. + * @param targetTableName target table name. + * @param parallelism parallelism for RDD + * @return records in {@link JavaRdd[ GenericRecord ]}. + */ + def generateDeleteRecords(config: Config, sparkSession: SparkSession, avroSchemaString: String, + targetTableName: String, parallelism: Int): JavaRDD[GenericRecord] = { + val bounds = getLowerUpperBoundsFromPercentiles(config, sparkSession, targetTableName) + val rows = sparkSession.sql( + constructChangedRecordQuery(config, targetTableName, avroSchemaString, bounds._1, bounds._2)) + + val rdd = HoodieSparkUtils + .createRdd(rows, RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME, + RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE, reconcileToLatestSchema = false, Option.empty()) + .map(record => { + record.put(GenericRecordFullPayloadGenerator.DEFAULT_HOODIE_IS_DELETED_COL, true) + record + }) + .toJavaRDD() + val repartitionedRdd = rdd.repartition(parallelism) + repartitionedRdd.persist(StorageLevel.DISK_ONLY) + repartitionedRdd + } +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java index be6e55226dea8..c32f44d1c5f20 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java @@ -18,12 +18,6 @@ package org.apache.hudi.integ.testsuite.job; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.util.UUID; -import java.util.stream.Stream; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieTableType; @@ -37,21 +31,31 @@ import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator; import org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector; import org.apache.hudi.integ.testsuite.reader.DeltaInputType; +import org.apache.hudi.integ.testsuite.schema.SchemaUtils; +import org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; import org.apache.hudi.keygen.TimestampBasedKeyGenerator; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import 
org.apache.hudi.utilities.sources.AvroDFSSource; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.log4j.Level; import org.apache.log4j.Logger; -import org.junit.jupiter.api.Test; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import java.util.UUID; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + /** * Unit test against {@link HoodieTestSuiteJob}. */ @@ -72,6 +76,9 @@ public class TestHoodieTestSuiteJob extends UtilitiesTestBase { private static final String COW_DAG_FILE_NAME_SPARK_DATASOURCE_NODES = "unit-test-cow-dag-spark-datasource.yaml"; private static final String COW_DAG_SPARK_DATASOURCE_NODES_RELATIVE_PATH = "/hudi-integ-test/src/test/resources/unit-test-cow-dag-spark-datasource.yaml"; + private static final String SPARK_SQL_DAG_FILE_NAME = "unit-test-spark-sql-dag.yaml"; + private static final String SPARK_SQL_DAG_SOURCE_PATH = "/hudi-integ-test/src/test/resources/" + SPARK_SQL_DAG_FILE_NAME; + public static Stream configParams() { Object[][] data = new Object[][] {{false, "COPY_ON_WRITE"}}; @@ -102,6 +109,8 @@ public static void initClass() throws Exception { + COW_DAG_SPARK_DATASOURCE_NODES_RELATIVE_PATH, dfs, dfsBasePath + "/" + COW_DAG_FILE_NAME_SPARK_DATASOURCE_NODES); UtilitiesTestBase.Helpers.savePropsToDFS(getProperties(), dfs, dfsBasePath + "/test-source" + ".properties"); + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." 
+ + SPARK_SQL_DAG_SOURCE_PATH, dfs, dfsBasePath + "/" + SPARK_SQL_DAG_FILE_NAME); // Properties used for the delta-streamer which incrementally pulls from upstream DFS Avro source and // writes to downstream hudi table @@ -269,22 +278,35 @@ public void testSparkDataSourceNodesDagWithLock() throws Exception { assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 3); } + @Test + public void testSparkSqlDag() throws Exception { + boolean useDeltaStreamer = false; + this.cleanDFSDirs(); + String inputBasePath = dfsBasePath + "/input"; + String outputBasePath = dfsBasePath + "/result"; + HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType + .COPY_ON_WRITE.name()); + cfg.workloadYamlPath = dfsBasePath + "/" + SPARK_SQL_DAG_FILE_NAME; + HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); + hoodieTestSuiteJob.runTestSuite(); + } + protected HoodieTestSuiteConfig makeConfig(String inputBasePath, String outputBasePath, boolean useDeltaStream, - String tableType) { + String tableType) { HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig(); cfg.targetBasePath = outputBasePath; cfg.inputBasePath = inputBasePath; cfg.targetTableName = "table1"; cfg.tableType = tableType; cfg.sourceClassName = AvroDFSSource.class.getName(); - cfg.sourceOrderingField = "timestamp"; + cfg.sourceOrderingField = SchemaUtils.SOURCE_ORDERING_FIELD; cfg.propsFilePath = dfsBasePath + "/test-source.properties"; cfg.outputTypeName = DeltaOutputMode.DFS.name(); cfg.inputFormatName = DeltaInputType.AVRO.name(); cfg.limitFileSize = 1024 * 1024L; cfg.sourceLimit = 20000000; cfg.workloadDagGenerator = WorkflowDagGenerator.class.getName(); - cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + cfg.schemaProviderClassName = TestSuiteFileBasedSchemaProvider.class.getName(); cfg.useDeltaStreamer = useDeltaStream; return cfg; } diff --git a/hudi-integ-test/src/test/resources/unit-test-spark-sql-dag.yaml b/hudi-integ-test/src/test/resources/unit-test-spark-sql-dag.yaml new file mode 100644 index 0000000000000..0b4ff072a97f7 --- /dev/null +++ b/hudi-integ-test/src/test/resources/unit-test-spark-sql-dag.yaml @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: unit-test-spark-sql-dag.yaml
+dag_rounds: 1
+dag_intermittent_delay_mins: 1
+dag_content:
+  create_table:
+    config:
+      table_type: cow
+      primary_key: _row_key
+      pre_combine_field: test_suite_source_ordering_field
+      partition_field: rider
+      record_size: 1000
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 1000
+    type: spark.sql.SparkSqlCreateTableNode
+    deps: none
+  insert_records:
+    config:
+      record_size: 1000
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 1000
+    type: spark.sql.SparkSqlInsertNode
+    deps: create_table
+  #merge_records:
+  #  config:
+  #    merge_condition: target._row_key = source._row_key
+  #    matched_action: update set *
+  #    not_matched_action: insert *
+  #    record_size: 1000
+  #    num_partitions_insert: 10
+  #    repeat_count: 1
+  #    num_records_upsert: 100
+  #    num_records_insert: 1000
+  #  type: spark.sql.SparkSqlMergeNode
+  #  deps: insert_records
+  #delete_records:
+  #  config:
+  #    condition_column: begin_lat
+  #    record_size: 1000
+  #    repeat_count: 1
+  #    ratio_records_change: 0.2
+  #  type: spark.sql.SparkSqlDeleteNode
+  #  deps: insert_records
+  #validate:
+  #  config:
+  #    delete_input_data: true
+  #  type: spark.sql.SparkSqlValidateDatasetNode
+  #  deps: delete_records
diff --git a/hudi-kafka-connect/README.md b/hudi-kafka-connect/README.md
index fd0a5d010794d..e2abab1ccbabd 100644
--- a/hudi-kafka-connect/README.md
+++ b/hudi-kafka-connect/README.md
@@ -15,32 +15,36 @@
 * See the License for the specific language governing permissions and
 -->
-# Quick Start guide for Kafka Connect Sink for Hudi
+# Quick Start (demo) guide for Kafka Connect Sink for Hudi
 This repo contains a sample project that can be used to start off your own source connector for Kafka Connect.
+This work is tracked by [HUDI-2324](https://issues.apache.org/jira/browse/HUDI-2324)
-## Building the connector
+## Building the Hudi Sink Connector
 The first thing you need to do to start using this connector is building it. In order to do that, you need to install the following dependencies:
 - [Java 1.8+](https://openjdk.java.net/)
 - [Apache Maven](https://maven.apache.org/)
+- [kcat](https://github.com/edenhill/kcat)
-After installing these dependencies, execute the following command:
+After installing these dependencies, execute the following commands. This will install all the Hudi dependency jars,
+including the fat packaged jar that contains all the dependencies required for a functional Hudi Kafka Connect Sink.
 ```bash
 cd $HUDI_DIR
-mvn clean package
+mvn clean -DskipTests install
 ```
-## Incremental Builds
+Henceforth, incremental builds can be performed as follows.
 ```bash
 mvn clean -pl hudi-kafka-connect install -DskipTests
 mvn clean -pl packaging/hudi-kafka-connect-bundle install
 ```
-## Put hudi connector in Kafka Connect classpath
+Next, we need to make sure that the hudi sink connector bundle jar is in the Kafka Connect classpath. Note that the Connect
+classpath should be the same as the one configured in the connector configuration file.
 ```bash
 cp $HUDI_DIR/packaging/hudi-kafka-connect-bundle/target/hudi-kafka-connect-bundle-0.10.0-SNAPSHOT.jar /usr/local/share/java/hudi-kafka-connect/
@@ -52,43 +56,110 @@ After building the package, we need to install the Apache Kafka
 ### 1 - Starting the environment
-Start the ZK and Kafka:
+To try out the Connect Sink locally, set up a local Kafka broker. Download the latest Apache Kafka from https://kafka.apache.org/downloads.
+Once downloaded and built, run the Zookeeper server and Kafka server using the command line tools.
 ```bash
+export KAFKA_HOME=/path/to/kafka_install_dir
+cd $KAFKA_HOME
 ./bin/zookeeper-server-start.sh ./config/zookeeper.properties
 ./bin/kafka-server-start.sh ./config/server.properties
 ```
 Wait until the kafka cluster is up and running.
-### 2 - Create the Hudi Control Topic for Coordination of the transactions
+### 2 - Set up the schema registry
-The control topic should only have `1` partition
+Hudi leverages the schema registry to obtain the latest schema when writing records. While it supports most popular schema
+registries, we use the Confluent schema registry here. Download the latest Confluent platform and run the schema registry
+service.
 ```bash
+cd $CONFLUENT_DIR
+./bin/schema-registry-start etc/schema-registry/schema-registry.properties
+```
+
+### 3 - Create the Hudi Control Topic for Coordination of the transactions
+
+The control topic should only have `1` partition, since it is used to coordinate the Hudi write transactions across multiple Connect tasks.
+
+```bash
+cd $KAFKA_HOME
 ./bin/kafka-topics.sh --delete --topic hudi-control-topic --bootstrap-server localhost:9092
 ./bin/kafka-topics.sh --create --topic hudi-control-topic --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
 ```
-### 3 - Create the Hudi Topic for the Sink and insert data into the topic
+### 4 - Create the Hudi Topic for the Sink and insert data into the topic
 Open a terminal to execute the following command:
 ```bash
-bash runKafkaTrafficGenerator.sh 
+cd $HUDI_DIR/hudi-kafka-connect/demo/
+bash setupKafka.sh -n 
 ```
-### 4 - Run the Sink connector worker (multiple workers can be run)
+### 5 - Run the Sink connector worker (multiple workers can be run)
-Open a terminal to execute the following command:
+Kafka Connect is a distributed platform that can run one or more workers (each running multiple tasks),
+which process the records from the Kafka partitions of the same topic in parallel. We provide a properties file with
+default properties to start a Hudi connector.
+
+Note that if multiple workers need to be run, the web server needs to be reconfigured for each subsequent worker to
+ensure the workers start successfully.
 ```bash
-./bin/connect-distributed.sh ../hudi-kafka-connect/configs/connect-distributed.properties
+cd $KAFKA_HOME
+./bin/connect-distributed.sh $HUDI_DIR/hudi-kafka-connect/demo/connect-distributed.properties
 ```
-### 5- To add the Hudi Sink to the Connector (delete it if you want to re-configure)
+### 6 - To add the Hudi Sink to the Connector (delete it if you want to re-configure)
+Once the Connect worker has started, it will not run the sink until the Hudi sink is added using the web API. The following
+curl commands can be used to delete and add a new Hudi sink. Again, a default configuration is provided for the Hudi sink,
+which can be changed based on the desired properties.
+
 ```bash
 curl -X DELETE http://localhost:8083/connectors/hudi-sink
-curl -X POST -H "Content-Type:application/json" -d @$HUDI-DIR/hudi-kafka-connect/configs/config-sink.json http://localhost:8083/connectors
+curl -X POST -H "Content-Type:application/json" -d @$HUDI_DIR/hudi-kafka-connect/demo/config-sink.json http://localhost:8083/connectors
+```
+
+Now, you should see that the connector is created and tasks are running.
+
+```bash
+curl -X GET -H "Content-Type:application/json" http://localhost:8083/connectors
+["hudi-sink"]
+curl -X GET -H "Content-Type:application/json" http://localhost:8083/connectors/hudi-sink/status | jq
 ```
+
+You should also see your Hudi table created, which you can query using Spark/Flink.
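+
+For a quick sanity check from Spark (a minimal sketch, assuming the demo defaults in `demo/config-sink.json`,
+i.e. base path `file:///tmp/hoodie/hudi-test-topic` and partition field `date`, and a spark-shell with the Hudi
+Spark bundle on its classpath):
+
+```scala
+// Read the table written by the sink directly from its base path
+val df = spark.read.format("hudi").load("file:///tmp/hoodie/hudi-test-topic")
+// Row counts per partition written by the sink
+df.groupBy("date").count().show()
+```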
+Note: HUDI-2325 tracks Hive sync, which will unlock pretty much every other query engine. + +```bash +ls -a /tmp/hoodie/hudi-test-topic +. .hoodie partition-1 partition-3 +.. partition-0 partition-2 partition-4 + +ls -lt /tmp/hoodie/hudi-test-topic/.hoodie +total 72 +-rw-r--r-- 1 user wheel 346 Sep 14 10:32 hoodie.properties +-rw-r--r-- 1 user wheel 0 Sep 13 23:18 20210913231805.inflight +-rw-r--r-- 1 user wheel 0 Sep 13 23:18 20210913231805.commit.requested +-rw-r--r-- 1 user wheel 9438 Sep 13 21:45 20210913214351.commit +-rw-r--r-- 1 user wheel 0 Sep 13 21:43 20210913214351.inflight +-rw-r--r-- 1 user wheel 0 Sep 13 21:43 20210913214351.commit.requested +-rw-r--r-- 1 user wheel 18145 Sep 13 21:43 20210913214114.commit +-rw-r--r-- 1 user wheel 0 Sep 13 21:41 20210913214114.inflight +-rw-r--r-- 1 user wheel 0 Sep 13 21:41 20210913214114.commit.requested +drwxr-xr-x 2 user wheel 64 Sep 13 21:41 archived + +ls -l /tmp/hoodie/hudi-test-topic/partition-0 +total 5168 +-rw-r--r-- 1 user wheel 439332 Sep 13 21:43 2E0E6DB44ACC8479059574A2C71C7A7E-0_0-0-0_20210913214114.parquet +-rw-r--r-- 1 user wheel 440179 Sep 13 21:42 3B56FAAAE2BDD04E480C1CBACD463D3E-0_0-0-0_20210913214114.parquet +-rw-r--r-- 1 user wheel 437097 Sep 13 21:45 3B56FAAAE2BDD04E480C1CBACD463D3E-0_0-0-0_20210913214351.parquet +-rw-r--r-- 1 user wheel 440219 Sep 13 21:42 D5AEE453699D5D9623D704C1CF399C8C-0_0-0-0_20210913214114.parquet +-rw-r--r-- 1 user wheel 437035 Sep 13 21:45 D5AEE453699D5D9623D704C1CF399C8C-0_0-0-0_20210913214351.parquet +-rw-r--r-- 1 user wheel 440214 Sep 13 21:43 E200FA75DCD1CED60BE86BCE6BF5D23A-0_0-0-0_20210913214114.parquet +``` + + diff --git a/hudi-kafka-connect/configs/config-sink.json b/hudi-kafka-connect/demo/config-sink.json similarity index 55% rename from hudi-kafka-connect/configs/config-sink.json rename to hudi-kafka-connect/demo/config-sink.json index 4e94bf5413a61..2d2be00f89358 100644 --- a/hudi-kafka-connect/configs/config-sink.json +++ b/hudi-kafka-connect/demo/config-sink.json @@ -9,11 +9,11 @@ "value.converter.schemas.enable": "false", "topics": "hudi-test-topic", "hoodie.table.name": "hudi-test-topic", - "hoodie.base.path": "file:///tmp/hoodie/sample-table", + "hoodie.table.type": "MERGE_ON_READ", + "hoodie.base.path": "file:///tmp/hoodie/hudi-test-topic", "hoodie.datasource.write.recordkey.field": "volume", - "hoodie.datasource.write.partitionpath.field": "year", - "hoodie.schemaprovider.class": "org.apache.hudi.schema.FilebasedSchemaProvider", - "hoodie.deltastreamer.schemaprovider.source.schema.file": "file:///tmp/hoodie/schema.avsc", - "hoodie.deltastreamer.schemaprovider.target.schema.file": "file:///tmp/hoodie/schema.avsc" - } + "hoodie.datasource.write.partitionpath.field": "date", + "hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider", + "hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8081/subjects/hudi-test-topic/versions/latest" + } } diff --git a/hudi-kafka-connect/configs/connect-distributed.properties b/hudi-kafka-connect/demo/connect-distributed.properties similarity index 94% rename from hudi-kafka-connect/configs/connect-distributed.properties rename to hudi-kafka-connect/demo/connect-distributed.properties index d7d453c69dfba..9e3cec1492386 100644 --- a/hudi-kafka-connect/configs/connect-distributed.properties +++ b/hudi-kafka-connect/demo/connect-distributed.properties @@ -30,4 +30,4 @@ status.storage.replication.factor=1 offset.flush.interval.ms=60000 listeners=HTTP://:8083 
-plugin.path=/usr/local/share/java,/usr/local/share/kafka/plugins,/opt/connectors,
+plugin.path=/usr/local/share/java
diff --git a/hudi-kafka-connect/demo/setupKafka.sh b/hudi-kafka-connect/demo/setupKafka.sh
new file mode 100755
index 0000000000000..20edb1ceb2eac
--- /dev/null
+++ b/hudi-kafka-connect/demo/setupKafka.sh
@@ -0,0 +1,146 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/bin/bash
+
+#########################
+# The command line help #
+#########################
+usage() {
+  echo "Usage: $0"
+  echo "   -n |--num-kafka-records, (required) number of kafka records to generate"
+  echo "   -f |--raw-file, (optional) raw file for the kafka records"
+  echo "   -k |--kafka-topic, (optional) Topic name for Kafka"
+  echo "   -m |--num-kafka-partitions, (optional) number of kafka partitions"
+  echo "   -r |--record-key, (optional) field to use as record key"
+  echo "   -l |--num-hudi-partitions, (optional) number of hudi partitions"
+  echo "   -p |--partition-key, (optional) field to use as partition"
+  echo "   -s |--schema-file, (optional) path of the file containing the schema of the records"
+  exit 1
+}
+
+case "$1" in
+--help)
+  usage
+  exit 0
+  ;;
+esac
+
+if [ $# -lt 1 ]; then
+  echo "Illegal number of parameters"
+  usage
+  exit 0
+fi
+
+## defaults
+rawDataFile=${HUDI_DIR}/docker/demo/data/batch_1.json
+kafkaTopicName=hudi-test-topic
+numKafkaPartitions=4
+recordKey=volume
+numHudiPartitions=5
+partitionField=date
+schemaFile=${HUDI_DIR}/docker/demo/config/schema.avsc
+
+while getopts ":n:f:k:m:r:l:p:s:-:" opt; do
+  case $opt in
+  n)
+    num_records="$OPTARG"
+    printf "Argument num-kafka-records is %s\n" "$num_records"
+    ;;
+  f)
+    rawDataFile="$OPTARG"
+    printf "Argument raw-file is %s\n" "$rawDataFile"
+    ;;
+  k)
+    kafkaTopicName="$OPTARG"
+    printf "Argument kafka-topic is %s\n" "$kafkaTopicName"
+    ;;
+  m)
+    numKafkaPartitions="$OPTARG"
+    printf "Argument num-kafka-partitions is %s\n" "$numKafkaPartitions"
+    ;;
+  r)
+    recordKey="$OPTARG"
+    printf "Argument record-key is %s\n" "$recordKey"
+    ;;
+  l)
+    numHudiPartitions="$OPTARG"
+    printf "Argument num-hudi-partitions is %s\n" "$numHudiPartitions"
+    ;;
+  p)
+    partitionField="$OPTARG"
+    printf "Argument partition-key is %s\n" "$partitionField"
+    ;;
+  s)
+    schemaFile="$OPTARG"
+    printf "Argument schema-file is %s\n" "$schemaFile"
+    ;;
+  -)
+    echo "Invalid option -$OPTARG" >&2
+    ;;
+  esac
+done
+
+# First delete the existing topic
+${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server localhost:9092
+
+# Create the topic with the configured number of partitions
+${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server localhost:9092
+
+# Setup the schema registry
+export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g'
${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring) +curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8081/subjects/${kafkaTopicName}/versions +curl -X GET http://localhost:8081/subjects/${kafkaTopicName}/versions/latest + +# Generate kafka messages from raw records +# Each records with unique keys and generate equal messages across each hudi partition +partitions={} +for ((i = 0; i < ${numHudiPartitions}; i++)); do + partitions[$i]="partition-"$i +done + +events_file=/tmp/kcat-input.events +rm -f ${events_file} + +recordValue=0 +num_records=$((num_records + 0)) + +for (( ; ; )); do + while IFS= read line; do + for partitionValue in "${partitions[@]}"; do + echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>${events_file} + ((recordValue = recordValue + 1)) + + if [ $recordValue -gt $num_records ]; then + break + fi + done + + if [ $recordValue -gt $num_records ]; then + break + fi + + if [ $(($recordValue % 1000)) -eq 0 ]; then + sleep 1 + fi + done <"$rawDataFile" + + if [ $recordValue -gt $num_records ]; then + break + fi +done + +grep -v '^$' ${events_file} | kcat -P -b localhost:9092 -t hudi-test-topic diff --git a/hudi-kafka-connect/pom.xml b/hudi-kafka-connect/pom.xml index 7742f3b312022..7a79f265c9713 100644 --- a/hudi-kafka-connect/pom.xml +++ b/hudi-kafka-connect/pom.xml @@ -63,6 +63,25 @@ org.apache.rat apache-rat-plugin + + com.github.os72 + protoc-jar-maven-plugin + 3.1.0.1 + + + generate-sources + + run + + + ${protoc.version} + + src/main/resources + + + + + @@ -138,6 +157,13 @@ + + + com.google.protobuf + protobuf-java + ${proto.version} + + log4j diff --git a/hudi-kafka-connect/scripts/raw.json b/hudi-kafka-connect/scripts/raw.json deleted file mode 100644 index aa2cc70374eca..0000000000000 --- a/hudi-kafka-connect/scripts/raw.json +++ /dev/null @@ -1,5 +0,0 @@ -{"volume": 0, "symbol": "TPNL", "ts": "2017-08-31 09:30:00", "month": "08", "high": 6.37, "low": 1.37, "key": "TPNL_2017-08-31 09", "year": 2017, "date": "2017/08/31", "close": 4.44, "open": 1.37, "day": "31"} -{"volume": 0, "symbol": "SPOT", "ts": "2018-08-31 09:30:00", "month": "08", "high": 1.87, "low": 0.37, "key": "TPNL_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 1.44, "open": 1.77, "day": "31"} -{"volume": 0, "symbol": "GOOG", "ts": "2019-08-31 09:30:00", "month": "08", "high": 2.1, "low": 1.7, "key": "TPNL_2019-08-31 09", "year": 2019, "date": "2019/08/31", "close": 1.94, "open": 2.0, "day": "31"} -{"volume": 0, "symbol": "MSFT", "ts": "2020-08-31 09:30:00", "month": "08", "high": 3.33, "low": 0.87, "key": "TPNL_2020-08-31 09", "year": 2020, "date": "2020/08/31", "close": 3.33, "open": 3.1, "day": "31"} -{"volume": 0, "symbol": "APPL", "ts": "2021-08-31 09:30:00", "month": "08", "high": 3.17, "low": 2.37, "key": "TPNL_2021-08-31 09", "year": 2021, "date": "2021/08/31", "close": 2.66, "open": 3.1, "day": "31"} diff --git a/hudi-kafka-connect/scripts/runKafkaTrafficGenerator.sh b/hudi-kafka-connect/scripts/runKafkaTrafficGenerator.sh deleted file mode 100644 index cff4140706af8..0000000000000 --- a/hudi-kafka-connect/scripts/runKafkaTrafficGenerator.sh +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/bin/bash - -# First delete the existing topic -$KAFKA_HOME/bin/kafka-topics.sh --delete --topic hudi-test-topic --bootstrap-server localhost:9092 - -# Create the topic with 4 partitions -$KAFKA_HOME/bin/kafka-topics.sh --create --topic hudi-test-topic --partitions 4 --replication-factor 1 --bootstrap-server localhost:9092 - -# Generate kafka messages from raw records -inputFile="raw.json" -# Generate the records with unique keys -for ((recordKey=0; recordKey<=$1; )) -do - while IFS= read line - do - echo $line | jq --argjson recordKey $recordKey -c '.volume = $recordKey' | kcat -P -b localhost:9092 -t hudi-test-topic - ((recordKey++)) - if [ $(( $recordKey % 1000 )) -eq 0 ] - then sleep 1 - fi - done < "$inputFile" -done diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java index c7dde9a2e8f23..c14a86656a6da 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java @@ -25,11 +25,13 @@ import org.apache.hudi.connect.transaction.TransactionParticipant; import org.apache.hudi.connect.writers.KafkaConnectConfigs; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; import org.apache.kafka.clients.consumer.OffsetAndMetadata; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.common.config.ConfigException; import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.RetriableException; import org.apache.kafka.connect.sink.SinkRecord; import org.apache.kafka.connect.sink.SinkTask; import org.apache.log4j.LogManager; @@ -49,10 +51,9 @@ public class HoodieSinkTask extends SinkTask { public static final String TASK_ID_CONFIG_NAME = "task.id"; private static final Logger LOG = LogManager.getLogger(HoodieSinkTask.class); - private static final int COORDINATOR_KAFKA_PARTITION = 0; private final Map transactionCoordinators; - private final Map hudiTransactionParticipants; + private final Map transactionParticipants; private KafkaConnectControlAgent controlKafkaClient; private KafkaConnectConfigs connectConfigs; @@ -60,8 +61,8 @@ public class HoodieSinkTask extends SinkTask { private String connectorName; public HoodieSinkTask() { - transactionCoordinators = new HashMap(); - hudiTransactionParticipants = new HashMap<>(); + transactionCoordinators = new HashMap<>(); + transactionParticipants = new HashMap<>(); } @Override @@ -80,7 +81,6 @@ public void start(Map props) { controlKafkaClient = KafkaConnectControlAgent.createKafkaControlManager( connectConfigs.getBootstrapServers(), connectConfigs.getControlTopicName()); - bootstrap(context.assignment()); } catch (ConfigException e) { throw new 
ConnectException("Couldn't start HdfsSinkConnector due to configuration error.", e); } catch (ConnectException e) { @@ -98,11 +98,25 @@ public void put(Collection records) { String topic = record.topic(); int partition = record.kafkaPartition(); TopicPartition tp = new TopicPartition(topic, partition); - hudiTransactionParticipants.get(tp).buffer(record); + + TransactionParticipant transactionParticipant = transactionParticipants.get(tp); + if (transactionParticipant != null) { + transactionParticipant.buffer(record); + } } for (TopicPartition partition : context.assignment()) { - hudiTransactionParticipants.get(partition).processRecords(); + if (transactionParticipants.get(partition) == null) { + throw new RetriableException("TransactionParticipant should be created for each assigned partition, " + + "but has not been created for the topic/partition: " + partition.topic() + ":" + partition.partition()); + } + try { + transactionParticipants.get(partition).processRecords(); + } catch (HoodieIOException exception) { + throw new RetriableException("Intermittent write errors for Hudi " + + " for the topic/partition: " + partition.topic() + ":" + partition.partition() + + " , ensuring kafka connect will retry ", exception); + } } } @@ -123,12 +137,9 @@ public Map preCommit(Map result = new HashMap<>(); for (TopicPartition partition : context.assignment()) { - TransactionParticipant worker = hudiTransactionParticipants.get(partition); - if (worker != null) { - worker.processRecords(); - if (worker.getLastKafkaCommittedOffset() >= 0) { - result.put(partition, new OffsetAndMetadata(worker.getLastKafkaCommittedOffset())); - } + TransactionParticipant worker = transactionParticipants.get(partition); + if (worker != null && worker.getLastKafkaCommittedOffset() >= 0) { + result.put(partition, new OffsetAndMetadata(worker.getLastKafkaCommittedOffset())); } } return result; @@ -152,13 +163,13 @@ public void close(Collection partitions) { // make sure we apply the WAL, and only reuse the temp file if the starting offset is still // valid. For now, we prefer the simpler solution that may result in a bit of wasted effort. 
for (TopicPartition partition : partitions) { - if (partition.partition() == COORDINATOR_KAFKA_PARTITION) { + if (partition.partition() == ConnectTransactionCoordinator.COORDINATOR_KAFKA_PARTITION) { if (transactionCoordinators.containsKey(partition)) { transactionCoordinators.get(partition).stop(); transactionCoordinators.remove(partition); } } - TransactionParticipant worker = hudiTransactionParticipants.remove(partition); + TransactionParticipant worker = transactionParticipants.remove(partition); if (worker != null) { try { LOG.debug("Closing data writer due to task start failure."); @@ -176,7 +187,7 @@ private void bootstrap(Collection partitions) { for (TopicPartition partition : partitions) { try { // If the partition is 0, instantiate the Leader - if (partition.partition() == COORDINATOR_KAFKA_PARTITION) { + if (partition.partition() == ConnectTransactionCoordinator.COORDINATOR_KAFKA_PARTITION) { ConnectTransactionCoordinator coordinator = new ConnectTransactionCoordinator( connectConfigs, partition, @@ -185,7 +196,7 @@ private void bootstrap(Collection partitions) { transactionCoordinators.put(partition, coordinator); } ConnectTransactionParticipant worker = new ConnectTransactionParticipant(connectConfigs, partition, controlKafkaClient, context); - hudiTransactionParticipants.put(partition, worker); + transactionParticipants.put(partition, worker); worker.start(); } catch (HoodieException exception) { LOG.error(String.format("Fatal error initializing task %s for partition %s", taskId, partition.partition()), exception); @@ -195,7 +206,7 @@ private void bootstrap(Collection partitions) { private void cleanup() { for (TopicPartition partition : context.assignment()) { - TransactionParticipant worker = hudiTransactionParticipants.get(partition); + TransactionParticipant worker = transactionParticipants.get(partition); if (worker != null) { try { LOG.debug("Closing data writer due to task start failure."); @@ -205,7 +216,7 @@ private void cleanup() { } } } - hudiTransactionParticipants.clear(); + transactionParticipants.clear(); transactionCoordinators.forEach((topic, transactionCoordinator) -> transactionCoordinator.stop()); transactionCoordinators.clear(); } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java index 536ad4a80307c..9c4674706a6a1 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java @@ -18,17 +18,13 @@ package org.apache.hudi.connect; -import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.connect.utils.KafkaConnectUtils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.table.FileIdPrefixProvider; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.nio.charset.StandardCharsets; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.Objects; import java.util.Properties; public class KafkaConnectFileIdPrefixProvider extends FileIdPrefixProvider { @@ -52,18 +48,9 @@ public String createFilePrefix(String partitionPath) { // We use a combination of kafka partition and partition path as the file id, and then hash it // to generate a fixed sized hash. 
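The prefix provider now delegates to KafkaConnectUtils.hashDigest, so a given (Kafka partition, Hudi partition path) pair always yields the same fixed-length file ID prefix; retries and restarts therefore map back to the same prefix instead of creating new ones. A standalone sketch of the same idea:

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;

    final class FileIdPrefixSketch {
      // Deterministic, fixed-length prefix from (kafkaPartition, partitionPath),
      // mirroring what KafkaConnectUtils.hashDigest produces.
      static String prefixFor(int kafkaPartition, String partitionPath) {
        String raw = kafkaPartition + partitionPath;
        try {
          byte[] digest = MessageDigest.getInstance("MD5")
              .digest(raw.getBytes(StandardCharsets.UTF_8));
          StringBuilder hex = new StringBuilder();
          for (byte b : digest) {
            hex.append(String.format("%02X", b));
          }
          return hex.toString();
        } catch (NoSuchAlgorithmException e) {
          throw new IllegalStateException("MD5 not available", e);
        }
      }
    }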
String rawFileIdPrefix = kafkaPartition + partitionPath; - MessageDigest md; - try { - md = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - LOG.error("Fatal error selecting hash algorithm", e); - throw new HoodieException(e); - } - - byte[] digest = Objects.requireNonNull(md).digest(rawFileIdPrefix.getBytes(StandardCharsets.UTF_8)); - + String hashedPrefix = KafkaConnectUtils.hashDigest(rawFileIdPrefix); LOG.info("CreateFileId for Kafka Partition " + kafkaPartition + " : " + partitionPath + " = " + rawFileIdPrefix - + " === " + StringUtils.toHexString(digest).toUpperCase()); - return StringUtils.toHexString(digest).toUpperCase(); + + " === " + hashedPrefix); + return hashedPrefix; } } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java index a115147ae2763..776beafbd6bea 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java @@ -18,17 +18,16 @@ package org.apache.hudi.connect.kafka; -import org.apache.hudi.connect.transaction.ControlEvent; +import org.apache.hudi.connect.ControlMessage; import org.apache.hudi.connect.transaction.TransactionCoordinator; import org.apache.hudi.connect.transaction.TransactionParticipant; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.kafka.clients.consumer.CommitFailedException; import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.clients.consumer.ConsumerRecords; import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.common.serialization.Deserializer; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -66,7 +65,7 @@ public class KafkaConnectControlAgent implements KafkaControlAgent { // List of TransactionParticipants per Kafka Topic private final Map> partitionWorkers; private final KafkaControlProducer producer; - private KafkaConsumer consumer; + private KafkaConsumer consumer; public KafkaConnectControlAgent(String bootstrapServers, String controlTopicName) { @@ -118,7 +117,7 @@ public void deregisterTransactionCoordinator(TransactionCoordinator coordinator) } @Override - public void publishMessage(ControlEvent message) { + public void publishMessage(ControlMessage message) { producer.publishMessage(message); } @@ -128,28 +127,28 @@ private void start() { // Todo fetch the worker id or name instead of a uuid. 
props.put(ConsumerConfig.GROUP_ID_CONFIG, "hudi-control-group" + UUID.randomUUID().toString()); props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); - props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, KafkaJsonDeserializer.class); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); // Since we are using Kafka Control Topic as a RPC like interface, // we want consumers to only process messages that are sent after they come online props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); - consumer = new KafkaConsumer<>(props, new StringDeserializer(), - new KafkaJsonDeserializer<>(ControlEvent.class)); + consumer = new KafkaConsumer<>(props, new StringDeserializer(), new ByteArrayDeserializer()); consumer.subscribe(Collections.singletonList(controlTopicName)); executorService.submit(() -> { while (true) { - ConsumerRecords records; + ConsumerRecords records; records = consumer.poll(Duration.ofMillis(KAFKA_POLL_TIMEOUT_MS)); - for (ConsumerRecord record : records) { + for (ConsumerRecord record : records) { try { LOG.debug(String.format("Kafka consumerGroupId = %s topic = %s, partition = %s, offset = %s, customer = %s, country = %s", "", record.topic(), record.partition(), record.offset(), record.key(), record.value())); - ControlEvent message = record.value(); - String senderTopic = message.senderPartition().topic(); - if (message.getSenderType().equals(ControlEvent.SenderType.COORDINATOR)) { + ControlMessage message = ControlMessage.parseFrom(record.value()); + String senderTopic = message.getTopicName(); + + if (message.getReceiverType().equals(ControlMessage.EntityType.PARTICIPANT)) { if (partitionWorkers.containsKey(senderTopic)) { for (TransactionParticipant partitionWorker : partitionWorkers.get(senderTopic)) { partitionWorker.processControlEvent(message); @@ -157,11 +156,9 @@ private void start() { } else { LOG.warn(String.format("Failed to send message for unregistered participants for topic %s", senderTopic)); } - } else if (message.getSenderType().equals(ControlEvent.SenderType.PARTICIPANT)) { + } else if (message.getReceiverType().equals(ControlMessage.EntityType.COORDINATOR)) { if (topicCoordinators.containsKey(senderTopic)) { topicCoordinators.get(senderTopic).processControlEvent(message); - } else { - LOG.warn(String.format("Failed to send message for unregistered coordinator for topic %s", senderTopic)); } } else { LOG.warn(String.format("Sender type of Control Message unknown %s", message.getSenderType().name())); @@ -200,31 +197,4 @@ public void stop() { } } } - - /** - * Deserializes the incoming Kafka records for the Control Topic. - * - * @param represents the object that is sent over the Control Topic. 
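With the JSON (de)serializers removed, control events now travel as protobuf bytes: the producer publishes ControlMessage.toByteArray() and the consumer loop above rebuilds the message with ControlMessage.parseFrom() and routes it by receiver type. A minimal round-trip sketch (error handling trimmed):

    import org.apache.hudi.connect.ControlMessage;

    import com.google.protobuf.InvalidProtocolBufferException;

    final class ControlMessageRoundTrip {
      static byte[] encode(ControlMessage message) {
        // What KafkaControlProducer now publishes as the Kafka record value.
        return message.toByteArray();
      }

      static void dispatch(byte[] value) throws InvalidProtocolBufferException {
        ControlMessage message = ControlMessage.parseFrom(value);
        switch (message.getReceiverType()) {
          case PARTICIPANT:
            // route to the TransactionParticipants registered for message.getTopicName()
            break;
          case COORDINATOR:
            // route to the TransactionCoordinator for message.getTopicName()
            break;
          default:
            // unknown receiver type: log and drop
        }
      }
    }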
- */ - public static class KafkaJsonDeserializer implements Deserializer { - - private static final Logger LOG = LogManager.getLogger(KafkaJsonDeserializer.class); - private final Class type; - - KafkaJsonDeserializer(Class type) { - this.type = type; - } - - @Override - public T deserialize(String s, byte[] bytes) { - ObjectMapper mapper = new ObjectMapper(); - T obj = null; - try { - obj = mapper.readValue(bytes, type); - } catch (Exception e) { - LOG.error(e.getMessage()); - } - return obj; - } - } } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java index ea5177eb5db3b..85b843557b1b7 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java @@ -18,7 +18,7 @@ package org.apache.hudi.connect.kafka; -import org.apache.hudi.connect.transaction.ControlEvent; +import org.apache.hudi.connect.ControlMessage; import org.apache.hudi.connect.transaction.TransactionCoordinator; import org.apache.hudi.connect.transaction.TransactionParticipant; @@ -37,5 +37,5 @@ public interface KafkaControlAgent { void deregisterTransactionCoordinator(TransactionCoordinator coordinator); - void publishMessage(ControlEvent message); + void publishMessage(ControlMessage message); } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java index a23251e35b31f..530e57059d5e0 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java @@ -18,16 +18,13 @@ package org.apache.hudi.connect.kafka; -import org.apache.hudi.connect.transaction.ControlEvent; +import org.apache.hudi.connect.ControlMessage; -import com.fasterxml.jackson.annotation.JsonAutoDetect; -import com.fasterxml.jackson.annotation.PropertyAccessor; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.kafka.clients.producer.KafkaProducer; import org.apache.kafka.clients.producer.Producer; import org.apache.kafka.clients.producer.ProducerConfig; import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.common.serialization.Serializer; +import org.apache.kafka.common.serialization.ByteArraySerializer; import org.apache.kafka.common.serialization.StringSerializer; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -45,7 +42,7 @@ public class KafkaControlProducer { private final String bootstrapServers; private final String controlTopicName; - private Producer producer; + private Producer producer; public KafkaControlProducer(String bootstrapServers, String controlTopicName) { this.bootstrapServers = bootstrapServers; @@ -57,12 +54,12 @@ private void start() { Properties props = new Properties(); props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class); - props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, KafkaJsonSerializer.class); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); producer = new KafkaProducer<>( props, new StringSerializer(), - new KafkaJsonSerializer() + new ByteArraySerializer() ); } @@ -70,28 +67,9 @@ public void stop() 
{ producer.close(); } - public void publishMessage(ControlEvent message) { - ProducerRecord record - = new ProducerRecord<>(controlTopicName, message.key(), message); + public void publishMessage(ControlMessage message) { + ProducerRecord record + = new ProducerRecord<>(controlTopicName, message.getType().name(), message.toByteArray()); producer.send(record); } - - public static class KafkaJsonSerializer implements Serializer { - - private static final Logger LOG = LogManager.getLogger(KafkaJsonSerializer.class); - - @Override - public byte[] serialize(String topic, ControlEvent data) { - byte[] retVal = null; - ObjectMapper objectMapper = new ObjectMapper(); - objectMapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY); - - try { - retVal = objectMapper.writeValueAsBytes(data); - } catch (Exception e) { - LOG.error("Fatal error during serialization of Kafka Control Message ", e); - } - return retVal; - } - } } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java index 13291c82791b2..7acd875b6beee 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java @@ -20,6 +20,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.connect.ControlMessage; import org.apache.hudi.connect.kafka.KafkaControlAgent; import org.apache.hudi.connect.utils.KafkaConnectUtils; import org.apache.hudi.connect.writers.ConnectTransactionServices; @@ -53,6 +54,8 @@ */ public class ConnectTransactionCoordinator implements TransactionCoordinator, Runnable { + public static final int COORDINATOR_KAFKA_PARTITION = 0; + private static final Logger LOG = LogManager.getLogger(ConnectTransactionCoordinator.class); private static final String BOOTSTRAP_SERVERS_CFG = "bootstrap.servers"; private static final String KAFKA_OFFSET_KEY = "kafka.commit.offsets"; @@ -131,6 +134,7 @@ public void start() { @Override public void stop() { kafkaControlClient.deregisterTransactionCoordinator(this); + scheduler.shutdownNow(); hasStarted.set(false); if (executorService != null) { boolean terminated = false; @@ -157,17 +161,18 @@ public TopicPartition getPartition() { } @Override - public void processControlEvent(ControlEvent message) { + public void processControlEvent(ControlMessage message) { CoordinatorEvent.CoordinatorEventType type; - if (message.getMsgType().equals(ControlEvent.MsgType.WRITE_STATUS)) { + if (message.getType().equals(ControlMessage.EventType.WRITE_STATUS)) { type = CoordinatorEvent.CoordinatorEventType.WRITE_STATUS; } else { - LOG.warn(String.format("The Coordinator should not be receiving messages of type %s", message.getMsgType().name())); + LOG.warn(String.format("The Coordinator should not be receiving messages of type %s", + message.getType().name())); return; } CoordinatorEvent event = new CoordinatorEvent(type, - message.senderPartition().topic(), + message.getTopicName(), message.getCommitTime()); event.setMessage(message); submitEvent(event); @@ -241,15 +246,7 @@ private void startNewCommit() { partitionsWriteStatusReceived.clear(); try { currentCommitTime = transactionServices.startCommit(); - ControlEvent message = new ControlEvent.Builder( - ControlEvent.MsgType.START_COMMIT, - 
ControlEvent.SenderType.COORDINATOR, - currentCommitTime, - partition) - .setCoordinatorInfo( - new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets)) - .build(); - kafkaControlClient.publishMessage(message); + kafkaControlClient.publishMessage(buildControlMessage(ControlMessage.EventType.START_COMMIT)); currentState = State.STARTED_COMMIT; // schedule a timeout for ending the current commit submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.END_COMMIT, @@ -267,14 +264,7 @@ private void startNewCommit() { private void endExistingCommit() { try { - ControlEvent message = new ControlEvent.Builder( - ControlEvent.MsgType.END_COMMIT, - ControlEvent.SenderType.COORDINATOR, - currentCommitTime, - partition) - .setCoordinatorInfo(new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets)) - .build(); - kafkaControlClient.publishMessage(message); + kafkaControlClient.publishMessage(buildControlMessage(ControlMessage.EventType.END_COMMIT)); } catch (Exception exception) { LOG.warn(String.format("Could not send END_COMMIT message for partition %s and commitTime %s", partition, currentCommitTime), exception); } @@ -288,13 +278,11 @@ private void endExistingCommit() { configs.getCoordinatorWriteTimeoutSecs(), TimeUnit.SECONDS); } - private void onReceiveWriteStatus(ControlEvent message) { - ControlEvent.ParticipantInfo participantInfo = message.getParticipantInfo(); - if (participantInfo.getOutcomeType().equals(ControlEvent.OutcomeType.WRITE_SUCCESS)) { - int partition = message.senderPartition().partition(); - partitionsWriteStatusReceived.put(partition, participantInfo.writeStatuses()); - currentConsumedKafkaOffsets.put(partition, participantInfo.getKafkaCommitOffset()); - } + private void onReceiveWriteStatus(ControlMessage message) { + ControlMessage.ParticipantInfo participantInfo = message.getParticipantInfo(); + int partition = message.getSenderPartition(); + partitionsWriteStatusReceived.put(partition, KafkaConnectUtils.getWriteStatuses(participantInfo)); + currentConsumedKafkaOffsets.put(partition, participantInfo.getKafkaOffset()); if (partitionsWriteStatusReceived.size() >= numPartitions && currentState.equals(State.ENDED_COMMIT)) { // Commit the kafka offsets to the commit file @@ -310,7 +298,7 @@ private void onReceiveWriteStatus(ControlEvent message) { currentState = State.WRITE_STATUS_RCVD; globalCommittedKafkaOffsets.putAll(currentConsumedKafkaOffsets); submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.ACK_COMMIT, - partition.topic(), + message.getTopicName(), currentCommitTime)); } catch (Exception exception) { LOG.error("Fatal error while committing file", exception); @@ -333,15 +321,7 @@ private void handleWriteStatusTimeout() { private void submitAckCommit() { try { - ControlEvent message = new ControlEvent.Builder( - ControlEvent.MsgType.ACK_COMMIT, - ControlEvent.SenderType.COORDINATOR, - currentCommitTime, - partition) - .setCoordinatorInfo( - new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets)) - .build(); - kafkaControlClient.publishMessage(message); + kafkaControlClient.publishMessage(buildControlMessage(ControlMessage.EventType.ACK_COMMIT)); } catch (Exception exception) { LOG.warn(String.format("Could not send ACK_COMMIT message for partition %s and commitTime %s", partition, currentCommitTime), exception); } @@ -396,4 +376,20 @@ private enum State { public interface KafkaPartitionProvider { int getLatestNumPartitions(String bootstrapServers, String topicName); } + + private ControlMessage 
buildControlMessage(ControlMessage.EventType eventType) { + return ControlMessage.newBuilder() + .setProtocolVersion(KafkaConnectConfigs.CURRENT_PROTOCOL_VERSION) + .setType(eventType) + .setTopicName(partition.topic()) + .setSenderType(ControlMessage.EntityType.COORDINATOR) + .setSenderPartition(partition.partition()) + .setReceiverType(ControlMessage.EntityType.PARTICIPANT) + .setCommitTime(currentCommitTime) + .setCoordinatorInfo( + ControlMessage.CoordinatorInfo.newBuilder() + .putAllGlobalKafkaCommitOffsets(globalCommittedKafkaOffsets) + .build() + ).build(); + } } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java index fe1996e654e3f..19556dca45ead 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java @@ -19,11 +19,14 @@ package org.apache.hudi.connect.transaction; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.ControlMessage; import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.utils.KafkaConnectUtils; import org.apache.hudi.connect.writers.ConnectWriterProvider; import org.apache.hudi.connect.writers.KafkaConnectConfigs; import org.apache.hudi.connect.writers.KafkaConnectWriterProvider; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.connect.sink.SinkRecord; @@ -32,7 +35,6 @@ import org.apache.log4j.Logger; import java.io.IOException; -import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.concurrent.BlockingQueue; @@ -47,7 +49,7 @@ public class ConnectTransactionParticipant implements TransactionParticipant { private static final Logger LOG = LogManager.getLogger(ConnectTransactionParticipant.class); private final LinkedList buffer; - private final BlockingQueue controlEvents; + private final BlockingQueue controlEvents; private final TopicPartition partition; private final SinkTaskContext context; private final KafkaControlAgent kafkaControlAgent; @@ -96,7 +98,7 @@ public void buffer(SinkRecord record) { } @Override - public void processControlEvent(ControlEvent message) { + public void processControlEvent(ControlMessage message) { controlEvents.add(message); } @@ -113,8 +115,8 @@ public TopicPartition getPartition() { @Override public void processRecords() { while (!controlEvents.isEmpty()) { - ControlEvent message = controlEvents.poll(); - switch (message.getMsgType()) { + ControlMessage message = controlEvents.poll(); + switch (message.getType()) { case START_COMMIT: handleStartCommit(message); break; @@ -128,14 +130,14 @@ public void processRecords() { // ignore write status since its only processed by leader break; default: - throw new IllegalStateException("HudiTransactionParticipant received incorrect state " + message.getMsgType()); + throw new IllegalStateException("HudiTransactionParticipant received incorrect state " + message.getType().name()); } } writeRecords(); } - private void handleStartCommit(ControlEvent message) { + private void handleStartCommit(ControlMessage message) { // If there is an existing/ongoing transaction locally // but it failed globally since we received another START_COMMIT 
instead of an END_COMMIT or ACK_COMMIT, // so close it and start new transaction @@ -153,7 +155,7 @@ private void handleStartCommit(ControlEvent message) { } } - private void handleEndCommit(ControlEvent message) { + private void handleEndCommit(ControlMessage message) { if (ongoingTransactionInfo == null) { LOG.warn(String.format("END_COMMIT %s is received while we were NOT in active transaction", message.getCommitTime())); return; @@ -167,32 +169,38 @@ private void handleEndCommit(ControlEvent message) { return; } + context.pause(partition); + ongoingTransactionInfo.commitInitiated(); // send Writer Status Message and wait for ACK_COMMIT in async fashion try { - context.pause(partition); - ongoingTransactionInfo.commitInitiated(); //sendWriterStatus - List writeStatuses = new ArrayList<>(); - try { - writeStatuses = ongoingTransactionInfo.getWriter().close(); - } catch (IOException exception) { - LOG.warn("Error closing the Hudi Writer", exception); - } + List writeStatuses = ongoingTransactionInfo.getWriter().close(); + + ControlMessage writeStatusEvent = ControlMessage.newBuilder() + .setProtocolVersion(KafkaConnectConfigs.CURRENT_PROTOCOL_VERSION) + .setType(ControlMessage.EventType.WRITE_STATUS) + .setTopicName(partition.topic()) + .setSenderType(ControlMessage.EntityType.PARTICIPANT) + .setSenderPartition(partition.partition()) + .setReceiverType(ControlMessage.EntityType.COORDINATOR) + .setReceiverPartition(ConnectTransactionCoordinator.COORDINATOR_KAFKA_PARTITION) + .setCommitTime(ongoingTransactionInfo.getCommitTime()) + .setParticipantInfo( + ControlMessage.ParticipantInfo.newBuilder() + .setWriteStatus(KafkaConnectUtils.buildWriteStatuses(writeStatuses)) + .setKafkaOffset(ongoingTransactionInfo.getLastWrittenKafkaOffset()) + .build() + ).build(); - ControlEvent writeStatus = new ControlEvent.Builder(ControlEvent.MsgType.WRITE_STATUS, - ControlEvent.SenderType.PARTICIPANT, ongoingTransactionInfo.getCommitTime(), partition) - .setParticipantInfo(new ControlEvent.ParticipantInfo( - writeStatuses, - ongoingTransactionInfo.getLastWrittenKafkaOffset(), - ControlEvent.OutcomeType.WRITE_SUCCESS)) - .build(); - kafkaControlAgent.publishMessage(writeStatus); + kafkaControlAgent.publishMessage(writeStatusEvent); } catch (Exception exception) { - LOG.warn(String.format("Error ending commit %s for partition %s", message.getCommitTime(), partition.partition()), exception); + LOG.error(String.format("Error writing records and ending commit %s for partition %s", message.getCommitTime(), partition.partition()), exception); + throw new HoodieIOException(String.format("Error writing records and ending commit %s for partition %s", message.getCommitTime(), partition.partition()), + new IOException(exception)); } } - private void handleAckCommit(ControlEvent message) { + private void handleAckCommit(ControlMessage message) { // Update lastKafkCommitedOffset locally. 
if (ongoingTransactionInfo != null && committedKafkaOffset < ongoingTransactionInfo.getLastWrittenKafkaOffset()) { committedKafkaOffset = ongoingTransactionInfo.getLastWrittenKafkaOffset(); @@ -230,15 +238,15 @@ private void cleanupOngoingTransaction() { try { ongoingTransactionInfo.getWriter().close(); ongoingTransactionInfo = null; - } catch (IOException exception) { + } catch (HoodieIOException exception) { LOG.warn("Error received while trying to cleanup existing transaction", exception); } } } - private void syncKafkaOffsetWithLeader(ControlEvent message) { - if (message.getCoordinatorInfo() != null) { - Long coordinatorCommittedKafkaOffset = message.getCoordinatorInfo().getGlobalKafkaCommitOffsets().get(partition.partition()); + private void syncKafkaOffsetWithLeader(ControlMessage message) { + if (message.getCoordinatorInfo().getGlobalKafkaCommitOffsetsMap().containsKey(partition.partition())) { + Long coordinatorCommittedKafkaOffset = message.getCoordinatorInfo().getGlobalKafkaCommitOffsetsMap().get(partition.partition()); // Recover kafka committed offsets, treating the commit offset from the coordinator // as the source of truth if (coordinatorCommittedKafkaOffset != null && coordinatorCommittedKafkaOffset >= 0) { diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ControlEvent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ControlEvent.java deleted file mode 100644 index 093064881b663..0000000000000 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ControlEvent.java +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.connect.transaction; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.util.SerializationUtils; - -import org.apache.kafka.common.TopicPartition; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; - -/** - * The events sent over the Kafka Control Topic between the - * coordinator and the followers, in order to ensure - * coordination across all the writes. 
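The protobuf WRITE_STATUS replies feed the coordinator's commit round: it records each partition's write statuses and last consumed offset, and only commits to Hudi (and advances the globally committed Kafka offsets) once every partition has reported. A condensed sketch of that bookkeeping, with names simplified from ConnectTransactionCoordinator:

    import org.apache.hudi.client.WriteStatus;
    import org.apache.hudi.connect.ControlMessage;
    import org.apache.hudi.connect.utils.KafkaConnectUtils;

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    class WriteStatusCollector {
      private final Map<Integer, List<WriteStatus>> statusByPartition = new HashMap<>();
      private final Map<Integer, Long> consumedOffsets = new HashMap<>();

      boolean onWriteStatus(ControlMessage message, int numPartitions) {
        ControlMessage.ParticipantInfo info = message.getParticipantInfo();
        int partition = message.getSenderPartition();
        statusByPartition.put(partition, KafkaConnectUtils.getWriteStatuses(info));
        consumedOffsets.put(partition, info.getKafkaOffset());
        // true => every participant reported; the coordinator can commit to Hudi
        // and broadcast ACK_COMMIT with the updated global offsets.
        return statusByPartition.size() >= numPartitions;
      }
    }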
- */ -@SuppressWarnings("checkstyle:VisibilityModifier") -public class ControlEvent implements Serializable { - - private static final Logger LOG = LogManager.getLogger(ControlEvent.class); - private static final int CURRENT_VERSION = 0; - - private final int version = CURRENT_VERSION; - private MsgType msgType; - private SenderType senderType; - private String commitTime; - private byte[] senderPartition; - private CoordinatorInfo coordinatorInfo; - private ParticipantInfo participantInfo; - - public ControlEvent() { - } - - public ControlEvent(MsgType msgType, - SenderType senderType, - String commitTime, - byte[] senderPartition, - CoordinatorInfo coordinatorInfo, - ParticipantInfo participantInfo) { - this.msgType = msgType; - this.senderType = senderType; - this.commitTime = commitTime; - this.senderPartition = senderPartition; - this.coordinatorInfo = coordinatorInfo; - this.participantInfo = participantInfo; - } - - public String key() { - return msgType.name().toLowerCase(Locale.ROOT); - } - - public MsgType getMsgType() { - return msgType; - } - - public SenderType getSenderType() { - return senderType; - } - - public String getCommitTime() { - return commitTime; - } - - public byte[] getSenderPartition() { - return senderPartition; - } - - public TopicPartition senderPartition() { - return SerializationUtils.deserialize(senderPartition); - } - - public CoordinatorInfo getCoordinatorInfo() { - return coordinatorInfo; - } - - public ParticipantInfo getParticipantInfo() { - return participantInfo; - } - - public int getVersion() { - return version; - } - - @Override - public String toString() { - return String.format("%s %s %s %s %s %s", version, msgType.name(), commitTime, - Arrays.toString(senderPartition), coordinatorInfo.toString(), participantInfo.toString()); - } - - /** - * Builder that helps build {@link ControlEvent}. - */ - public static class Builder { - - private final MsgType msgType; - private SenderType senderType; - private final String commitTime; - private final byte[] senderPartition; - private CoordinatorInfo coordinatorInfo; - private ParticipantInfo participantInfo; - - public Builder(MsgType msgType, SenderType senderType, String commitTime, TopicPartition senderPartition) throws IOException { - this.msgType = msgType; - this.senderType = senderType; - this.commitTime = commitTime; - this.senderPartition = SerializationUtils.serialize(senderPartition); - } - - public Builder setCoordinatorInfo(CoordinatorInfo coordinatorInfo) { - this.coordinatorInfo = coordinatorInfo; - return this; - } - - public Builder setParticipantInfo(ParticipantInfo participantInfo) { - this.participantInfo = participantInfo; - return this; - } - - public ControlEvent build() { - return new ControlEvent(msgType, senderType, commitTime, senderPartition, coordinatorInfo, participantInfo); - } - } - - /** - * The info sent by the {@link TransactionCoordinator} to one or more - * {@link TransactionParticipant}s. - */ - public static class CoordinatorInfo implements Serializable { - - private Map globalKafkaCommitOffsets; - - public CoordinatorInfo() { - } - - public CoordinatorInfo(Map globalKafkaCommitOffsets) { - this.globalKafkaCommitOffsets = globalKafkaCommitOffsets; - } - - public Map getGlobalKafkaCommitOffsets() { - return (globalKafkaCommitOffsets == null) ? new HashMap<>() : globalKafkaCommitOffsets; - } - } - - /** - * The info sent by a {@link TransactionParticipant} instances to the - * {@link TransactionCoordinator}. 
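Every ControlMessage is also stamped with KafkaConnectConfigs.CURRENT_PROTOCOL_VERSION. The patch only sets the field, but a receiver could use it to ignore messages from incompatible senders during rolling upgrades; an illustrative guard (not part of this change):

    import org.apache.hudi.connect.ControlMessage;
    import org.apache.hudi.connect.writers.KafkaConnectConfigs;

    final class ProtocolVersionGuard {
      // Illustrative only: the patch stamps the version but does not yet enforce it.
      static boolean isCompatible(ControlMessage message) {
        return message.getProtocolVersion() == KafkaConnectConfigs.CURRENT_PROTOCOL_VERSION;
      }
    }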
- */ - public static class ParticipantInfo implements Serializable { - - private byte[] writeStatusList; - private long kafkaCommitOffset; - private OutcomeType outcomeType; - - public ParticipantInfo() { - } - - public ParticipantInfo(List writeStatuses, long kafkaCommitOffset, OutcomeType outcomeType) throws IOException { - this.writeStatusList = SerializationUtils.serialize(writeStatuses); - this.kafkaCommitOffset = kafkaCommitOffset; - this.outcomeType = outcomeType; - } - - public byte[] getWriteStatusList() { - return writeStatusList; - } - - public List writeStatuses() { - return SerializationUtils.deserialize(writeStatusList); - } - - public long getKafkaCommitOffset() { - return kafkaCommitOffset; - } - - public OutcomeType getOutcomeType() { - return outcomeType; - } - } - - /** - * Type of Control Event. - */ - public enum MsgType { - START_COMMIT, - END_COMMIT, - ACK_COMMIT, - WRITE_STATUS, - } - - public enum SenderType { - COORDINATOR, - PARTICIPANT - } - - public enum OutcomeType { - WRITE_SUCCESS, - } -} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java index a0e2654cdeaa2..f9f467a83bec7 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java @@ -18,6 +18,8 @@ package org.apache.hudi.connect.transaction; +import org.apache.hudi.connect.ControlMessage; + /** * The events within the Coordinator that trigger * the state changes in the state machine of @@ -28,7 +30,7 @@ public class CoordinatorEvent { private final CoordinatorEventType eventType; private final String topicName; private final String commitTime; - private ControlEvent message; + private ControlMessage message; public CoordinatorEvent(CoordinatorEventType eventType, String topicName, @@ -50,11 +52,11 @@ public String getCommitTime() { return commitTime; } - public ControlEvent getMessage() { + public ControlMessage getMessage() { return message; } - public void setMessage(ControlEvent message) { + public void setMessage(ControlMessage message) { this.message = message; } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java index 04f8a2e3c7a52..d6759d84c8858 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java @@ -18,6 +18,8 @@ package org.apache.hudi.connect.transaction; +import org.apache.hudi.connect.ControlMessage; + import org.apache.kafka.common.TopicPartition; /** @@ -36,5 +38,5 @@ public interface TransactionCoordinator { TopicPartition getPartition(); /* Called when a control event is received from the Kafka control topic */ - void processControlEvent(ControlEvent message); + void processControlEvent(ControlMessage message); } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java index 0179f3b717622..d27b14ef47644 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java +++ 
b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java @@ -18,6 +18,8 @@ package org.apache.hudi.connect.transaction; +import org.apache.hudi.connect.ControlMessage; + import org.apache.kafka.common.TopicPartition; import org.apache.kafka.connect.sink.SinkRecord; @@ -39,7 +41,7 @@ public interface TransactionParticipant { TopicPartition getPartition(); - void processControlEvent(ControlEvent message); + void processControlEvent(ControlMessage message); long getLastKafkaCommittedOffset(); } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java index 593cfb1241d05..3c77063ddf2fd 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java @@ -18,6 +18,7 @@ package org.apache.hudi.connect.utils; +import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; @@ -26,6 +27,9 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SerializationUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.connect.ControlMessage; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.hudi.keygen.CustomAvroKeyGenerator; @@ -33,6 +37,7 @@ import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import com.google.protobuf.ByteString; import org.apache.hadoop.conf.Configuration; import org.apache.kafka.clients.admin.AdminClient; import org.apache.kafka.clients.admin.DescribeTopicsResult; @@ -41,8 +46,14 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import java.util.Arrays; +import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Properties; import java.util.stream.Collectors; @@ -71,6 +82,7 @@ public static int getLatestNumPartitions(String bootstrapServers, String topicNa /** * Returns the default Hadoop Configuration. + * * @return */ public static Configuration getDefaultHadoopConf() { @@ -81,8 +93,9 @@ public static Configuration getDefaultHadoopConf() { /** * Extract the record fields. + * * @param keyGenerator key generator Instance of the keygenerator. - * @return Returns the record key columns seprarated by comma. + * @return Returns the record key columns separated by comma. */ public static String getRecordKeyColumns(KeyGenerator keyGenerator) { return String.join(",", keyGenerator.getRecordKeyFieldNames()); @@ -92,9 +105,9 @@ public static String getRecordKeyColumns(KeyGenerator keyGenerator) { * Extract partition columns directly if an instance of class {@link BaseKeyGenerator}, * else extract partition columns from the properties. * - * @param keyGenerator key generator Instance of the keygenerator. + * @param keyGenerator key generator Instance of the keygenerator. * @param typedProperties properties from the config. 
- * @return partition columns Returns the partition columns seprarated by comma. + * @return partition columns Returns the partition columns separated by comma. */ public static String getPartitionColumns(KeyGenerator keyGenerator, TypedProperties typedProperties) { @@ -137,4 +150,44 @@ public static Option getCommitMetadataForLatestInstant(Hoo return Option.empty(); } } + + public static String hashDigest(String stringToHash) { + MessageDigest md; + try { + md = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + LOG.error("Fatal error selecting hash algorithm", e); + throw new HoodieException(e); + } + byte[] digest = Objects.requireNonNull(md).digest(stringToHash.getBytes(StandardCharsets.UTF_8)); + return StringUtils.toHexString(digest).toUpperCase(); + } + + /** + * Build Protobuf message containing the Hudi {@link WriteStatus}. + * + * @param writeStatuses The list of Hudi {@link WriteStatus}. + * @return the protobuf message {@link org.apache.hudi.connect.ControlMessage.ConnectWriteStatus} + * that wraps the Hudi {@link WriteStatus}. + * @throws IOException thrown if the conversion failed. + */ + public static ControlMessage.ConnectWriteStatus buildWriteStatuses(List writeStatuses) throws IOException { + return ControlMessage.ConnectWriteStatus.newBuilder() + .setSerializedWriteStatus( + ByteString.copyFrom( + SerializationUtils.serialize(writeStatuses))) + .build(); + } + + /** + * Unwrap the Hudi {@link WriteStatus} from the received Protobuf message. + * + * @param participantInfo The {@link ControlMessage.ParticipantInfo} that contains the + * underlying {@link WriteStatus} sent by the participants. + * @return the list of {@link WriteStatus} returned by Hudi on a write transaction. + */ + public static List getWriteStatuses(ControlMessage.ParticipantInfo participantInfo) { + ControlMessage.ConnectWriteStatus connectWriteStatus = participantInfo.getWriteStatus(); + return SerializationUtils.deserialize(connectWriteStatus.getSerializedWriteStatus().toByteArray()); + } } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java index c958b2b4868ee..a579484f67369 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java @@ -21,7 +21,9 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.util.Option; +import org.apache.hudi.connect.utils.KafkaConnectUtils; import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.AvroConvertor; @@ -46,17 +48,19 @@ public abstract class AbstractConnectWriter implements ConnectWriter(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord)); + // Tag records with a file ID based on kafka partition and hudi partition. 
+ HoodieRecord hoodieRecord = new HoodieRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord)); + String fileId = KafkaConnectUtils.hashDigest(String.format("%s-%s", record.kafkaPartition(), hoodieRecord.getPartitionPath())); + hoodieRecord.unseal(); + hoodieRecord.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId)); + hoodieRecord.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); + hoodieRecord.seal(); writeHudiRecord(hoodieRecord); } @Override public List close() { - return flushHudiRecords(); + return flushRecords(); } - protected abstract void writeHudiRecord(HoodieRecord record); + protected abstract void writeHudiRecord(HoodieRecord record); - protected abstract List flushHudiRecords(); + protected abstract List flushRecords(); } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java index 3319604b502e6..0e92e674d42da 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java @@ -21,14 +21,14 @@ import org.apache.hudi.client.HoodieJavaWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.HoodieRecordSizeEstimator; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.IOUtils; import org.apache.hudi.keygen.KeyGenerator; @@ -40,8 +40,8 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; -import java.util.stream.Collectors; /** * Specific implementation of a Hudi Writer that buffers all incoming records, @@ -53,9 +53,8 @@ public class BufferedConnectWriter extends AbstractConnectWriter { private final HoodieEngineContext context; private final HoodieJavaWriteClient writeClient; - private final String instantTime; private final HoodieWriteConfig config; - private ExternalSpillableMap> bufferedRecords; + private ExternalSpillableMap> bufferedRecords; public BufferedConnectWriter(HoodieEngineContext context, HoodieJavaWriteClient writeClient, @@ -64,10 +63,9 @@ public BufferedConnectWriter(HoodieEngineContext context, HoodieWriteConfig config, KeyGenerator keyGenerator, SchemaProvider schemaProvider) { - super(connectConfigs, keyGenerator, schemaProvider); + super(connectConfigs, keyGenerator, schemaProvider, instantTime); this.context = context; this.writeClient = writeClient; - this.instantTime = instantTime; this.config = config; init(); } @@ -89,12 +87,12 @@ private void init() { } @Override - public void writeHudiRecord(HoodieRecord record) { + public void writeHudiRecord(HoodieRecord record) { bufferedRecords.put(record.getRecordKey(), record); } @Override - public List flushHudiRecords() { + public List flushRecords() { try { LOG.info("Number of entries in MemoryBasedMap => " + 
bufferedRecords.getInMemoryMapNumEntries() @@ -103,18 +101,28 @@ public List flushHudiRecords() { + bufferedRecords.getDiskBasedMapNumEntries() + "Size of file spilled to disk => " + bufferedRecords.getSizeOfFileOnDiskInBytes()); List writeStatuses = new ArrayList<>(); + + boolean isMorTable = Option.ofNullable(connectConfigs.getString(HoodieTableConfig.TYPE)) + .map(t -> t.equals(HoodieTableType.MERGE_ON_READ.name())) + .orElse(false); + // Write out all records if non-empty if (!bufferedRecords.isEmpty()) { - writeStatuses = writeClient.bulkInsertPreppedRecords( - bufferedRecords.values().stream().collect(Collectors.toList()), - instantTime, Option.empty()); + if (isMorTable) { + writeStatuses = writeClient.upsertPreppedRecords( + new LinkedList<>(bufferedRecords.values()), + instantTime); + } else { + writeStatuses = writeClient.bulkInsertPreppedRecords( + new LinkedList<>(bufferedRecords.values()), + instantTime, Option.empty()); + } } bufferedRecords.close(); - LOG.info("Flushed hudi records and got writeStatuses: " - + writeStatuses); + LOG.info("Flushed hudi records and got writeStatuses: " + writeStatuses); return writeStatuses; } catch (Exception e) { - throw new HoodieException("Write records failed", e); + throw new HoodieIOException("Write records failed", new IOException(e)); } } } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java index a90d72a4500b8..7249d4758ce38 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java @@ -27,5 +27,5 @@ public interface ConnectWriter { void writeRecord(SinkRecord record) throws IOException; - List close() throws IOException; + List close(); } diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java index ae6b5d1d3d73f..773ce1e048a7d 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java @@ -36,9 +36,10 @@ @Immutable @ConfigClassProperty(name = "Kafka Sink Connect Configurations", groupName = ConfigGroups.Names.KAFKA_CONNECT, - description = "Configurations for Kakfa Connect Sink Connector for Hudi.") + description = "Configurations for Kafka Connect Sink Connector for Hudi.") public class KafkaConnectConfigs extends HoodieConfig { + public static final int CURRENT_PROTOCOL_VERSION = 0; public static final String KAFKA_VALUE_CONVERTER = "value.converter"; public static final ConfigProperty KAFKA_BOOTSTRAP_SERVERS = ConfigProperty @@ -67,7 +68,7 @@ public class KafkaConnectConfigs extends HoodieConfig { public static final ConfigProperty COORDINATOR_WRITE_TIMEOUT_SECS = ConfigProperty .key("hoodie.kafka.coordinator.write.timeout.secs") - .defaultValue("60") + .defaultValue("300") .withDocumentation("The timeout after sending an END_COMMIT until when " + "the coordinator will wait for the write statuses from all the partitions" + "to ignore the current commit and start a new commit."); diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java index 
ad40ebcb7deb1..8039e56d37ba5 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java @@ -23,12 +23,10 @@ import org.apache.hudi.client.common.HoodieJavaEngineContext; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.connect.transaction.TransactionCoordinator; @@ -38,7 +36,6 @@ import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -54,19 +51,16 @@ public class KafkaConnectTransactionServices implements ConnectTransactionServices { private static final Logger LOG = LogManager.getLogger(KafkaConnectTransactionServices.class); - private static final String TABLE_FORMAT = "PARQUET"; private final Option tableMetaClient; private final Configuration hadoopConf; - private final FileSystem fs; private final String tableBasePath; private final String tableName; private final HoodieEngineContext context; private final HoodieJavaWriteClient javaClient; - public KafkaConnectTransactionServices( - KafkaConnectConfigs connectConfigs) throws HoodieException { + public KafkaConnectTransactionServices(KafkaConnectConfigs connectConfigs) throws HoodieException { HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() .withProperties(connectConfigs.getProps()).build(); @@ -74,29 +68,25 @@ public KafkaConnectTransactionServices( tableName = writeConfig.getTableName(); hadoopConf = KafkaConnectUtils.getDefaultHadoopConf(); context = new HoodieJavaEngineContext(hadoopConf); - fs = FSUtils.getFs(tableBasePath, hadoopConf); try { KeyGenerator keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator( new TypedProperties(connectConfigs.getProps())); - String recordKeyFields = KafkaConnectUtils.getRecordKeyColumns(keyGenerator); String partitionColumns = KafkaConnectUtils.getPartitionColumns(keyGenerator, new TypedProperties(connectConfigs.getProps())); - LOG.info(String.format("Setting record key %s and partitionfields %s for table %s", - recordKeyFields, - partitionColumns, - tableBasePath + tableName)); + LOG.info(String.format("Setting record key %s and partition fields %s for table %s", + recordKeyFields, partitionColumns, tableBasePath + tableName)); tableMetaClient = Option.of(HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.COPY_ON_WRITE.name()) .setTableName(tableName) .setPayloadClassName(HoodieAvroPayload.class.getName()) - .setBaseFileFormat(TABLE_FORMAT) .setRecordKeyFields(recordKeyFields) .setPartitionFields(partitionColumns) .setKeyGeneratorClassProp(writeConfig.getKeyGeneratorClass()) + .fromProperties(connectConfigs.getProps()) .initTable(hadoopConf, tableBasePath)); javaClient = new HoodieJavaWriteClient<>(context, writeConfig); @@ -113,8 +103,7 @@ public String startCommit() { } public void endCommit(String commitTime, 
List writeStatuses, Map extraMetadata) { - javaClient.commit(commitTime, writeStatuses, Option.of(extraMetadata), - HoodieActiveTimeline.COMMIT_ACTION, Collections.emptyMap()); + javaClient.commit(commitTime, writeStatuses, Option.of(extraMetadata)); LOG.info("Ending Hudi commit " + commitTime); } diff --git a/hudi-kafka-connect/src/main/resources/ControlMessage.proto b/hudi-kafka-connect/src/main/resources/ControlMessage.proto new file mode 100644 index 0000000000000..5059897c3fe80 --- /dev/null +++ b/hudi-kafka-connect/src/main/resources/ControlMessage.proto @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; + +option java_multiple_files = true; +option java_package = "org.apache.hudi.connect"; +option java_outer_classname = "ConnectControl"; + +package connect; + +message ControlMessage { + uint32 protocolVersion = 1; + EventType type = 2; + string topic_name = 3; + EntityType sender_type = 4; + uint32 sender_partition = 5; + EntityType receiver_type = 6; + uint32 receiver_partition = 7; + string commitTime = 8; + oneof payload { + CoordinatorInfo coordinator_info = 9; + ParticipantInfo participant_info = 10; + } + + message CoordinatorInfo { + map globalKafkaCommitOffsets = 1; + } + + message ParticipantInfo { + ConnectWriteStatus writeStatus = 1; + uint64 kafkaOffset = 2; + } + + message ConnectWriteStatus { + bytes serializedWriteStatus = 1; + } + + enum EventType { + START_COMMIT = 0; + END_COMMIT = 1; + ACK_COMMIT = 2; + WRITE_STATUS = 3; + } + + enum EntityType { + COORDINATOR = 0; + PARTICIPANT = 1; + } +} diff --git a/hudi-kafka-connect/src/main/resources/log4j.properties b/hudi-kafka-connect/src/main/resources/log4j.properties new file mode 100644 index 0000000000000..ff268faf6363c --- /dev/null +++ b/hudi-kafka-connect/src/main/resources/log4j.properties @@ -0,0 +1,23 @@ +### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### +log4j.rootLogger=INFO, A1 +# A1 is set to be a ConsoleAppender. 
+log4j.appender.A1=org.apache.log4j.ConsoleAppender +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java index 21940ab43dc93..6e049c6118b09 100644 --- a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java @@ -22,9 +22,9 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; import org.apache.hudi.connect.transaction.ConnectTransactionCoordinator; -import org.apache.hudi.connect.transaction.ControlEvent; import org.apache.hudi.connect.transaction.TransactionCoordinator; import org.apache.hudi.connect.transaction.TransactionParticipant; +import org.apache.hudi.connect.utils.KafkaConnectUtils; import org.apache.hudi.connect.writers.KafkaConnectConfigs; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.helper.MockConnectTransactionServices; @@ -108,7 +108,7 @@ private static class MockParticipant implements TransactionParticipant { private final int maxNumberCommitRounds; private final Map kafkaOffsetsCommitted; - private ControlEvent.MsgType expectedMsgType; + private ControlMessage.EventType expectedMsgType; private int numberCommitRounds; public MockParticipant(MockKafkaControlAgent kafkaControlAgent, @@ -121,7 +121,7 @@ public MockParticipant(MockKafkaControlAgent kafkaControlAgent, this.maxNumberCommitRounds = maxNumberCommitRounds; this.partition = new TopicPartition(TOPIC_NAME, (NUM_PARTITIONS - 1)); this.kafkaOffsetsCommitted = new HashMap<>(); - expectedMsgType = ControlEvent.MsgType.START_COMMIT; + expectedMsgType = ControlMessage.EventType.START_COMMIT; numberCommitRounds = 0; } @@ -149,9 +149,9 @@ public TopicPartition getPartition() { } @Override - public void processControlEvent(ControlEvent message) { - assertEquals(message.getSenderType(), ControlEvent.SenderType.COORDINATOR); - assertEquals(message.senderPartition().topic(), partition.topic()); + public void processControlEvent(ControlMessage message) { + assertEquals(message.getSenderType(), ControlMessage.EntityType.COORDINATOR); + assertEquals(message.getTopicName(), partition.topic()); testScenarios(message); } @@ -160,24 +160,24 @@ public long getLastKafkaCommittedOffset() { return 0; } - private void testScenarios(ControlEvent message) { - assertEquals(expectedMsgType, message.getMsgType()); + private void testScenarios(ControlMessage message) { + assertEquals(expectedMsgType, message.getType()); - switch (message.getMsgType()) { + switch (message.getType()) { case START_COMMIT: - expectedMsgType = ControlEvent.MsgType.END_COMMIT; + expectedMsgType = ControlMessage.EventType.END_COMMIT; break; case END_COMMIT: assertEquals(kafkaOffsetsCommitted, message.getCoordinatorInfo().getGlobalKafkaCommitOffsets()); int numSuccessPartitions; Map kafkaOffsets = new HashMap<>(); - List controlEvents = new ArrayList<>(); + List controlEvents = new ArrayList<>(); // Prepare the WriteStatuses for all partitions for (int i = 1; i <= NUM_PARTITIONS; i++) { try { long kafkaOffset = (long) (Math.random() * 10000); kafkaOffsets.put(i, kafkaOffset); - ControlEvent event = successWriteStatus( + ControlMessage event = 
successWriteStatus( message.getCommitTime(), new TopicPartition(TOPIC_NAME, i), kafkaOffset); @@ -191,11 +191,11 @@ private void testScenarios(ControlEvent message) { case ALL_CONNECT_TASKS_SUCCESS: numSuccessPartitions = NUM_PARTITIONS; kafkaOffsetsCommitted.putAll(kafkaOffsets); - expectedMsgType = ControlEvent.MsgType.ACK_COMMIT; + expectedMsgType = ControlMessage.EventType.ACK_COMMIT; break; case SUBSET_CONNECT_TASKS_FAILED: numSuccessPartitions = NUM_PARTITIONS / 2; - expectedMsgType = ControlEvent.MsgType.START_COMMIT; + expectedMsgType = ControlMessage.EventType.START_COMMIT; break; default: throw new HoodieException("Unknown test scenario " + testScenario); @@ -210,18 +210,18 @@ private void testScenarios(ControlEvent message) { if (numberCommitRounds >= maxNumberCommitRounds) { latch.countDown(); } - expectedMsgType = ControlEvent.MsgType.START_COMMIT; + expectedMsgType = ControlMessage.EventType.START_COMMIT; break; default: - throw new HoodieException("Illegal control message type " + message.getMsgType()); + throw new HoodieException("Illegal control message type " + message.getType()); } - if (message.getMsgType().equals(ControlEvent.MsgType.START_COMMIT)) { + if (message.getType().equals(ControlMessage.EventType.START_COMMIT)) { if (numberCommitRounds >= maxNumberCommitRounds) { latch.countDown(); } numberCommitRounds++; - expectedMsgType = ControlEvent.MsgType.END_COMMIT; + expectedMsgType = ControlMessage.EventType.END_COMMIT; } } @@ -230,24 +230,29 @@ public enum TestScenarios { ALL_CONNECT_TASKS_SUCCESS } - private static ControlEvent successWriteStatus(String commitTime, - TopicPartition partition, - long kafkaOffset) throws Exception { + private static ControlMessage successWriteStatus(String commitTime, + TopicPartition partition, + long kafkaOffset) throws Exception { // send WS WriteStatus writeStatus = new WriteStatus(); WriteStatus status = new WriteStatus(false, 1.0); for (int i = 0; i < 1000; i++) { status.markSuccess(mock(HoodieRecord.class), Option.empty()); } - return new ControlEvent.Builder(ControlEvent.MsgType.WRITE_STATUS, - ControlEvent.SenderType.PARTICIPANT, - commitTime, - partition) - .setParticipantInfo(new ControlEvent.ParticipantInfo( - Collections.singletonList(writeStatus), - kafkaOffset, - ControlEvent.OutcomeType.WRITE_SUCCESS)) - .build(); + return ControlMessage.newBuilder() + .setType(ControlMessage.EventType.WRITE_STATUS) + .setTopicName(partition.topic()) + .setSenderType(ControlMessage.EntityType.PARTICIPANT) + .setSenderPartition(partition.partition()) + .setReceiverType(ControlMessage.EntityType.COORDINATOR) + .setReceiverPartition(ConnectTransactionCoordinator.COORDINATOR_KAFKA_PARTITION) + .setCommitTime(commitTime) + .setParticipantInfo( + ControlMessage.ParticipantInfo.newBuilder() + .setWriteStatus(KafkaConnectUtils.buildWriteStatuses(Collections.singletonList(writeStatus))) + .setKafkaOffset(kafkaOffset) + .build() + ).build(); } } } diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java index 900ba46f7fbc2..5d551a79fa03d 100644 --- a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java @@ -21,7 +21,6 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.connect.kafka.KafkaControlAgent; import 
org.apache.hudi.connect.transaction.ConnectTransactionParticipant; -import org.apache.hudi.connect.transaction.ControlEvent; import org.apache.hudi.connect.transaction.TransactionCoordinator; import org.apache.hudi.connect.writers.KafkaConnectConfigs; import org.apache.hudi.exception.HoodieException; @@ -68,44 +67,47 @@ public void setUp() throws Exception { @EnumSource(value = CoordinatorFailureTestScenarios.class) public void testAllCoordinatorFailureScenarios(CoordinatorFailureTestScenarios testScenario) { int expectedRecordsWritten = 0; - switch (testScenario) { - case REGULAR_SCENARIO: - expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); - assertTrue(testKafkaConnect.isPaused()); - break; - case COORDINATOR_FAILED_AFTER_START_COMMIT: - testKafkaConnect.putRecordsToParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - // Coordinator Failed - initializeCoordinator(); - break; - case COORDINATOR_FAILED_AFTER_END_COMMIT: - testKafkaConnect.putRecordsToParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); - expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); - // Coordinator Failed - initializeCoordinator(); - break; - default: - throw new HoodieException("Unknown test scenario " + testScenario); - } - - // Regular Case or Coordinator Recovery Case - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); - expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); - assertTrue(testKafkaConnect.isResumed()); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - assertTrue(testKafkaConnect.isPaused()); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); - // Ensure Coordinator and participant are in sync in the kafka offsets - assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + try { + switch (testScenario) { + case REGULAR_SCENARIO: + expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + break; + case COORDINATOR_FAILED_AFTER_START_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.START_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + // Coordinator Failed + initializeCoordinator(); + break; + case COORDINATOR_FAILED_AFTER_END_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.START_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.END_COMMIT); + expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); + // Coordinator Failed + initializeCoordinator(); + break; + default: + throw new HoodieException("Unknown test scenario " + testScenario); + } + // Regular Case or Coordinator Recovery Case + coordinator.sendEventFromCoordinator(ControlMessage.EventType.START_COMMIT); + expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isResumed()); + 
coordinator.sendEventFromCoordinator(ControlMessage.EventType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.ACK_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + } catch (Exception exception) { + throw new HoodieException("Unexpected test failure ", exception); + } participant.stop(); } @@ -113,59 +115,63 @@ public void testAllCoordinatorFailureScenarios(CoordinatorFailureTestScenarios t @EnumSource(value = ParticipantFailureTestScenarios.class) public void testAllParticipantFailureScenarios(ParticipantFailureTestScenarios testScenario) { int expectedRecordsWritten = 0; - switch (testScenario) { - case FAILURE_BEFORE_START_COMMIT: - testKafkaConnect.putRecordsToParticipant(); - // Participant fails - initializeParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); - expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); - assertTrue(testKafkaConnect.isResumed()); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - assertTrue(testKafkaConnect.isPaused()); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); - // Ensure Coordinator and participant are in sync in the kafka offsets - assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); - break; - case FAILURE_AFTER_START_COMMIT: - testKafkaConnect.putRecordsToParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - // Participant fails - initializeParticipant(); - testKafkaConnect.putRecordsToParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - assertTrue(testKafkaConnect.isPaused()); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); - // Ensure Coordinator and participant are in sync in the kafka offsets - assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); - break; - case FAILURE_AFTER_END_COMMIT: - testKafkaConnect.putRecordsToParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - // Participant fails - initializeParticipant(); - testKafkaConnect.putRecordsToParticipant(); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - assertTrue(testKafkaConnect.isPaused()); - coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT); - testKafkaConnect.putRecordsToParticipant(); - assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); - // Ensure Coordinator and participant are in sync in the 
kafka offsets - assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); - break; - default: - throw new HoodieException("Unknown test scenario " + testScenario); + try { + switch (testScenario) { + case FAILURE_BEFORE_START_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + // Participant fails + initializeParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.START_COMMIT); + expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isResumed()); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.ACK_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + break; + case FAILURE_AFTER_START_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.START_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + // Participant fails + initializeParticipant(); + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.ACK_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + break; + case FAILURE_AFTER_END_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.START_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + // Participant fails + initializeParticipant(); + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + coordinator.sendEventFromCoordinator(ControlMessage.EventType.ACK_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + break; + default: + throw new HoodieException("Unknown test scenario " + testScenario); + } + } catch (Exception exception) { + throw new HoodieException("Unexpected test failure ", exception); } } @@ -196,7 +202,7 @@ private static class MockCoordinator implements TransactionCoordinator { private final KafkaControlAgent kafkaControlAgent; private final TopicPartition partition; - private Option lastReceivedWriteStatusEvent; + private Option lastReceivedWriteStatusEvent; private long committedKafkaOffset; public MockCoordinator(KafkaControlAgent kafkaControlAgent) { @@ -206,26 +212,30 @@ public 
MockCoordinator(KafkaControlAgent kafkaControlAgent) { committedKafkaOffset = 0L; } - public void sendEventFromCoordinator( - ControlEvent.MsgType type) { + public void sendEventFromCoordinator(ControlMessage.EventType type) { try { - if (type.equals(ControlEvent.MsgType.START_COMMIT)) { + if (type.equals(ControlMessage.EventType.START_COMMIT)) { ++currentCommitTime; } - kafkaControlAgent.publishMessage(new ControlEvent.Builder( - type, - ControlEvent.SenderType.COORDINATOR, - String.valueOf(currentCommitTime), - partition) - .setCoordinatorInfo(new ControlEvent.CoordinatorInfo( - Collections.singletonMap(PARTITION_NUMBER, committedKafkaOffset))) - .build()); + kafkaControlAgent.publishMessage( + ControlMessage.newBuilder() + .setType(type) + .setTopicName(partition.topic()) + .setSenderType(ControlMessage.EntityType.COORDINATOR) + .setSenderPartition(partition.partition()) + .setReceiverType(ControlMessage.EntityType.PARTICIPANT) + .setCommitTime(String.valueOf(currentCommitTime)) + .setCoordinatorInfo( + ControlMessage.CoordinatorInfo.newBuilder() + .putAllGlobalKafkaCommitOffsets(Collections.singletonMap(PARTITION_NUMBER, committedKafkaOffset)) + .build() + ).build()); } catch (Exception exception) { throw new HoodieException("Fatal error sending control event to Participant"); } } - public Option getLastReceivedWriteStatusEvent() { + public Option getLastReceivedWriteStatusEvent() { return lastReceivedWriteStatusEvent; } @@ -249,11 +259,11 @@ public TopicPartition getPartition() { } @Override - public void processControlEvent(ControlEvent message) { - if (message.getMsgType().equals(ControlEvent.MsgType.WRITE_STATUS)) { + public void processControlEvent(ControlMessage message) { + if (message.getType().equals(ControlMessage.EventType.WRITE_STATUS)) { lastReceivedWriteStatusEvent = Option.of(message); - assertTrue(message.getParticipantInfo().getKafkaCommitOffset() >= committedKafkaOffset); - committedKafkaOffset = message.getParticipantInfo().getKafkaCommitOffset(); + assertTrue(message.getParticipantInfo().getKafkaOffset() >= committedKafkaOffset); + committedKafkaOffset = message.getParticipantInfo().getKafkaOffset(); } } } diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java index 529cd75fde0af..eed79c4861250 100644 --- a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java @@ -18,8 +18,8 @@ package org.apache.hudi.helper; +import org.apache.hudi.connect.ControlMessage; import org.apache.hudi.connect.kafka.KafkaControlAgent; -import org.apache.hudi.connect.transaction.ControlEvent; import org.apache.hudi.connect.transaction.TransactionCoordinator; import org.apache.hudi.connect.transaction.TransactionParticipant; import org.apache.hudi.exception.HoodieException; @@ -70,10 +70,10 @@ public void deregisterTransactionParticipant(TransactionParticipant worker) { } @Override - public void publishMessage(ControlEvent message) { + public void publishMessage(ControlMessage message) { try { - String topic = message.senderPartition().topic(); - if (message.getSenderType().equals(ControlEvent.SenderType.COORDINATOR)) { + String topic = message.getTopicName(); + if (message.getSenderType().equals(ControlMessage.EntityType.COORDINATOR)) { for (TransactionParticipant participant : participants.get(topic)) { participant.processControlEvent(message); } 
diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestKafkaConnect.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestKafkaConnect.java index 953080921925b..6e947de072211 100644 --- a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestKafkaConnect.java +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestKafkaConnect.java @@ -25,6 +25,7 @@ import org.apache.kafka.connect.sink.SinkRecord; import org.apache.kafka.connect.sink.SinkTaskContext; +import java.io.IOException; import java.util.Arrays; import java.util.Map; import java.util.Set; @@ -60,7 +61,7 @@ public boolean isResumed() { return !isPaused; } - public int putRecordsToParticipant() { + public int putRecordsToParticipant() throws IOException { for (int i = 1; i <= NUM_RECORDS_BATCH; i++) { participant.buffer(getNextKafkaRecord()); } diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java index 3ca64c33d6a18..c8a3ad6ffd92e 100644 --- a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java @@ -148,7 +148,7 @@ private static class AbstractHudiConnectWriterTestWrapper extends AbstractConnec private List writtenRecords; public AbstractHudiConnectWriterTestWrapper(KafkaConnectConfigs connectConfigs, KeyGenerator keyGenerator, SchemaProvider schemaProvider) { - super(connectConfigs, keyGenerator, schemaProvider); + super(connectConfigs, keyGenerator, schemaProvider, "000"); writtenRecords = new ArrayList<>(); } @@ -157,12 +157,12 @@ public List getWrittenRecords() { } @Override - protected void writeHudiRecord(HoodieRecord record) { + protected void writeHudiRecord(HoodieRecord record) { writtenRecords.add(record); } @Override - protected List flushHudiRecords() { + protected List flushRecords() { return null; } } diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java index d1813e1a61814..b0dcf38f4b9d2 100644 --- a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java @@ -88,7 +88,7 @@ public void testSimpleWriteAndFlush() throws Exception { Mockito.verify(mockHoodieJavaWriteClient, times(0)) .bulkInsertPreppedRecords(anyList(), eq(COMMIT_TIME), eq(Option.empty())); - writer.flushHudiRecords(); + writer.flushRecords(); final ArgumentCaptor> actualRecords = ArgumentCaptor.forClass(List.class); Mockito.verify(mockHoodieJavaWriteClient, times(1)) .bulkInsertPreppedRecords(actualRecords.capture(), eq(COMMIT_TIME), eq(Option.empty())); diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java index 0dafba4e53f88..b98417ef2b8f4 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -296,6 +296,9 @@ public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String b SlashEncodedDayPartitionValueExtractor.class.getName()); hiveSyncConfig.useJdbc = 
Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_USE_JDBC().key(), DataSourceWriteOptions.HIVE_USE_JDBC().defaultValue())); + if (props.containsKey(DataSourceWriteOptions.HIVE_SYNC_MODE().key())) { + hiveSyncConfig.syncMode = props.getString(DataSourceWriteOptions.HIVE_SYNC_MODE().key()); + } hiveSyncConfig.autoCreateDatabase = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE().key(), DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE().defaultValue())); hiveSyncConfig.ignoreExceptions = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_IGNORE_EXCEPTIONS().key(), diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index 3c7fe7dafaf32..94bcc0d0de85e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -110,6 +110,12 @@ object DataSourceReadOptions { .withDocumentation("The query instant for time travel. Without specified this option," + " we query the latest snapshot.") + val ENABLE_DATA_SKIPPING: ConfigProperty[Boolean] = ConfigProperty + .key("hoodie.enable.data.skipping") + .defaultValue(true) + .sinceVersion("0.10.0") + .withDocumentation("enable data skipping to boost query after doing z-order optimize for current table") + /** @deprecated Use {@link QUERY_TYPE} and its methods instead */ @Deprecated val QUERY_TYPE_OPT_KEY = QUERY_TYPE.key() @@ -386,12 +392,13 @@ object DataSourceWriteOptions { val HIVE_PARTITION_FIELDS: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.hive_sync.partition_fields") .defaultValue("") - .withDocumentation("field in the table to use for determining hive partition columns.") + .withDocumentation("Field in the table to use for determining hive partition columns.") val HIVE_PARTITION_EXTRACTOR_CLASS: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.hive_sync.partition_extractor_class") .defaultValue(classOf[SlashEncodedDayPartitionValueExtractor].getCanonicalName) - .withDocumentation("") + .withDocumentation("Class which implements PartitionValueExtractor to extract the partition values, " + + "default 'SlashEncodedDayPartitionValueExtractor'.") val HIVE_ASSUME_DATE_PARTITION: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.hive_sync.assume_date_partitioning") @@ -401,7 +408,9 @@ object DataSourceWriteOptions { val HIVE_USE_PRE_APACHE_INPUT_FORMAT: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.hive_sync.use_pre_apache_input_format") .defaultValue("false") - .withDocumentation("") + .withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. " + + "Use this when you are in the process of migrating from " + + "com.uber.hoodie to org.apache.hudi. 
Stop using this after you migrated the table definition to org.apache.hudi input format") /** @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0 */ @Deprecated @@ -440,7 +449,7 @@ object DataSourceWriteOptions { val HIVE_TABLE_SERDE_PROPERTIES: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.hive_sync.serde_properties") .noDefaultValue() - .withDocumentation("") + .withDocumentation("Serde properties to hive table.") val HIVE_SYNC_AS_DATA_SOURCE_TABLE: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.hive_sync.sync_as_datasource") diff --git a/hudi-spark-datasource/hudi-spark/pom.xml b/hudi-spark-datasource/hudi-spark/pom.xml index c4fe1d109f502..11ebeca39e963 100644 --- a/hudi-spark-datasource/hudi-spark/pom.xml +++ b/hudi-spark-datasource/hudi-spark/pom.xml @@ -503,5 +503,33 @@ test + + org.slf4j + slf4j-api + ${slf4j.version} + test + + + + org.apache.hadoop + hadoop-hdfs + tests + test + + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java index c96d216e159fc..b3acf444adb88 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java @@ -34,7 +34,6 @@ import org.apache.spark.sql.api.java.UDF1; import org.apache.spark.sql.functions; import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructType; import java.util.ArrayList; import java.util.Arrays; @@ -53,8 +52,8 @@ public class HoodieDatasetBulkInsertHelper { private static final Logger LOG = LogManager.getLogger(HoodieDatasetBulkInsertHelper.class); - private static final String RECORD_KEY_UDF_FN = "hudi_recordkey_gen_function"; - private static final String PARTITION_PATH_UDF_FN = "hudi_partition_gen_function"; + private static final String RECORD_KEY_UDF_FN = "hudi_recordkey_gen_function_"; + private static final String PARTITION_PATH_UDF_FN = "hudi_partition_gen_function_"; /** * Prepares input hoodie spark dataset for bulk insert. It does the following steps. 
@@ -79,18 +78,19 @@ public static Dataset prepareHoodieDatasetForBulkInsert(SQLContext sqlConte properties.putAll(config.getProps()); String keyGeneratorClass = properties.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key()); BuiltinKeyGenerator keyGenerator = (BuiltinKeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, properties); - StructType structTypeForUDF = rows.schema(); - - sqlContext.udf().register(RECORD_KEY_UDF_FN, (UDF1) keyGenerator::getRecordKey, DataTypes.StringType); - sqlContext.udf().register(PARTITION_PATH_UDF_FN, (UDF1) keyGenerator::getPartitionPath, DataTypes.StringType); + String tableName = properties.getString(HoodieWriteConfig.TBL_NAME.key()); + String recordKeyUdfFn = RECORD_KEY_UDF_FN + tableName; + String partitionPathUdfFn = PARTITION_PATH_UDF_FN + tableName; + sqlContext.udf().register(recordKeyUdfFn, (UDF1) keyGenerator::getRecordKey, DataTypes.StringType); + sqlContext.udf().register(partitionPathUdfFn, (UDF1) keyGenerator::getPartitionPath, DataTypes.StringType); final Dataset rowDatasetWithRecordKeys = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD, - callUDF(RECORD_KEY_UDF_FN, org.apache.spark.sql.functions.struct( + callUDF(recordKeyUdfFn, org.apache.spark.sql.functions.struct( JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq()))); final Dataset rowDatasetWithRecordKeysAndPartitionPath = rowDatasetWithRecordKeys.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD, - callUDF(PARTITION_PATH_UDF_FN, + callUDF(partitionPathUdfFn, org.apache.spark.sql.functions.struct( JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq()))); diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java new file mode 100644 index 0000000000000..560b590183cf5 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.bootstrap; + +import org.apache.hudi.DataSourceUtils; +import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; + +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.SparkSession; + +import java.io.IOException; +import java.util.List; + +public abstract class SparkFullBootstrapDataProviderBase extends FullRecordBootstrapDataProvider> { + + private final transient SparkSession sparkSession; + + public SparkFullBootstrapDataProviderBase(TypedProperties props, + HoodieSparkEngineContext context) { + super(props, context); + this.sparkSession = SparkSession.builder().config(context.getJavaSparkContext().getConf()).getOrCreate(); + } + + @Override + public JavaRDD generateInputRecords(String tableName, String sourceBasePath, + List>> partitionPathsWithFiles) { + String[] filePaths = partitionPathsWithFiles.stream().map(Pair::getValue) + .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString())) + .toArray(String[]::new); + + Dataset inputDataset = sparkSession.read().format(getFormat()).load(filePaths); + try { + KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props); + String structName = tableName + "_record"; + String namespace = "hoodie." 
+ tableName; + RDD genericRecords = HoodieSparkUtils.createRdd(inputDataset, structName, namespace, false, + Option.empty()); + return genericRecords.toJavaRDD().map(gr -> { + String orderingVal = HoodieAvroUtils.getNestedFieldValAsString( + gr, props.getString("hoodie.datasource.write.precombine.field"), false); + try { + return DataSourceUtils.createHoodieRecord(gr, orderingVal, keyGenerator.getKey(gr), + props.getString("hoodie.datasource.write.payload.class")); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + }); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + } + + protected abstract String getFormat(); +} \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkOrcBootstrapDataProvider.java similarity index 61% rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java rename to hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkOrcBootstrapDataProvider.java index 055d330479e88..9176d19366625 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkOrcBootstrapDataProvider.java @@ -16,20 +16,23 @@ * limitations under the License. */ -package org.apache.hudi.table.upgrade; +package org.apache.hudi.bootstrap; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.TypedProperties; /** - * Downgrade handle to assist in downgrading hoodie table from version 2 to 1. + * Spark Data frame based bootstrap input provider. 
*/ -public class TwoToOneDowngradeHandler extends BaseTwoToOneDowngradeHandler { +public class SparkOrcBootstrapDataProvider extends SparkFullBootstrapDataProviderBase { + + public SparkOrcBootstrapDataProvider(TypedProperties props, + HoodieSparkEngineContext context) { + super(props, context); + } @Override - HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { - return HoodieSparkTable.create(config, context); + protected String getFormat() { + return "orc"; } -} +} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkParquetBootstrapDataProvider.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkParquetBootstrapDataProvider.java index 6051317460c65..e3bdbfe0aa888 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkParquetBootstrapDataProvider.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkParquetBootstrapDataProvider.java @@ -18,69 +18,21 @@ package org.apache.hudi.bootstrap; -import org.apache.hudi.DataSourceUtils; -import org.apache.hudi.HoodieSparkUtils; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.model.HoodieFileStatus; -import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider; import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.bootstrap.FileStatusUtils; import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.keygen.KeyGenerator; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -import java.io.IOException; -import java.util.List; /** * Spark Data frame based bootstrap input provider. */ -public class SparkParquetBootstrapDataProvider extends FullRecordBootstrapDataProvider> { - - private final transient SparkSession sparkSession; +public class SparkParquetBootstrapDataProvider extends SparkFullBootstrapDataProviderBase { public SparkParquetBootstrapDataProvider(TypedProperties props, HoodieSparkEngineContext context) { super(props, context); - this.sparkSession = SparkSession.builder().config(context.getJavaSparkContext().getConf()).getOrCreate(); } @Override - public JavaRDD generateInputRecords(String tableName, String sourceBasePath, - List>> partitionPathsWithFiles) { - String[] filePaths = partitionPathsWithFiles.stream().map(Pair::getValue) - .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString())) - .toArray(String[]::new); - - Dataset inputDataset = sparkSession.read().parquet(filePaths); - try { - KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props); - String structName = tableName + "_record"; - String namespace = "hoodie." 
+ tableName; - RDD genericRecords = HoodieSparkUtils.createRdd(inputDataset, structName, namespace, false, - Option.empty()); - return genericRecords.toJavaRDD().map(gr -> { - String orderingVal = HoodieAvroUtils.getNestedFieldValAsString( - gr, props.getString("hoodie.datasource.write.precombine.field"), false); - try { - return DataSourceUtils.createHoodieRecord(gr, orderingVal, keyGenerator.getKey(gr), - props.getString("hoodie.datasource.write.payload.class")); - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } - }); - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } + protected String getFormat() { + return "parquet"; } } \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala index 00133abcac730..a9d85af2ee47d 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala @@ -154,14 +154,12 @@ class DefaultSource extends RelationProvider mode: SaveMode, optParams: Map[String, String], df: DataFrame): BaseRelation = { - val parameters = HoodieWriterUtils.parametersWithWriteDefaults(optParams) - val translatedOptions = DataSourceWriteOptions.translateSqlOptions(parameters) val dfWithoutMetaCols = df.drop(HoodieRecord.HOODIE_META_COLUMNS.asScala:_*) - if (translatedOptions(OPERATION.key).equals(BOOTSTRAP_OPERATION_OPT_VAL)) { - HoodieSparkSqlWriter.bootstrap(sqlContext, mode, translatedOptions, dfWithoutMetaCols) + if (optParams.get(OPERATION.key).contains(BOOTSTRAP_OPERATION_OPT_VAL)) { + HoodieSparkSqlWriter.bootstrap(sqlContext, mode, optParams, dfWithoutMetaCols) } else { - HoodieSparkSqlWriter.write(sqlContext, mode, translatedOptions, dfWithoutMetaCols) + HoodieSparkSqlWriter.write(sqlContext, mode, optParams, dfWithoutMetaCols) } new HoodieEmptyRelation(sqlContext, dfWithoutMetaCols.schema) } @@ -170,11 +168,9 @@ class DefaultSource extends RelationProvider optParams: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { - val parameters = HoodieWriterUtils.parametersWithWriteDefaults(optParams) - val translatedOptions = DataSourceWriteOptions.translateSqlOptions(parameters) new HoodieStreamingSink( sqlContext, - translatedOptions, + optParams, partitionColumns, outputMode) } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index b87be995f823a..882636c4697be 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -28,19 +28,20 @@ import org.apache.hudi.common.table.view.{FileSystemViewStorageConfig, HoodieTab import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.internal.Logging -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Column, SparkSession} import org.apache.spark.sql.avro.SchemaConverters -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BoundReference, Expression, InterpretedPredicate} +import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BoundReference, 
Expression, InterpretedPredicate} import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory} -import org.apache.spark.sql.hudi.HoodieSqlUtils +import org.apache.spark.sql.hudi.{DataSkippingUtils, HoodieSqlUtils} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String - import java.util.Properties + import scala.collection.JavaConverters._ +import scala.collection.JavaConversions._ import scala.collection.mutable /** @@ -56,7 +57,7 @@ import scala.collection.mutable * * 2、If the partition columns size is not equal to the partition path level, but the partition * column size is "1" (e.g. partition column is "dt", but the partition path is "2021/03/10" - * who'es directory level is 3).We can still read it as a partitioned table. We will mapping the + * who's directory level is 3).We can still read it as a partitioned table. We will mapping the * partition path (e.g. 2021/03/10) to the only partition column (e.g. "dt"). * * 3、Else the the partition columns size is not equal to the partition directory level and the @@ -84,6 +85,12 @@ case class HoodieFileIndex( private val specifiedQueryInstant = options.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key) .map(HoodieSqlUtils.formatQueryInstant) + /** + * Get all completeCommits. + */ + lazy val completedCommits = metaClient.getCommitsTimeline + .filterCompletedInstants().getInstants.iterator().toList.map(_.getTimestamp) + /** * Get the schema of the table. */ @@ -106,7 +113,7 @@ case class HoodieFileIndex( nameFieldMap.getOrElse(column, throw new IllegalArgumentException(s"Cannot find column: '" + s"$column' in the schema[${schema.fields.mkString(",")}]"))) new StructType(partitionFields) - } else { // If the partition columns have not stored in hoodie.properites(the table that was + } else { // If the partition columns have not stored in hoodie.properties(the table that was // created earlier), we trait it as a non-partitioned table. logWarning("No partition columns available from hoodie.properties." 
+ " Partition pruning will not work") @@ -125,9 +132,6 @@ case class HoodieFileIndex( properties.put(HoodieMetadataConfig.ENABLE, sqlConf.getConfString(HoodieMetadataConfig.ENABLE.key(), HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS.toString)) - properties.put(HoodieMetadataConfig.VALIDATE_ENABLE, - sqlConf.getConfString(HoodieMetadataConfig.VALIDATE_ENABLE.key(), - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue().toString)) properties.putAll(options.asJava) properties } @@ -150,6 +154,48 @@ case class HoodieFileIndex( override def rootPaths: Seq[Path] = queryPath :: Nil + def enableDataSkipping(): Boolean = { + options.getOrElse(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), + spark.sessionState.conf.getConfString(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false")).toBoolean + } + + private def filterFilesByDataSkippingIndex(dataFilters: Seq[Expression]): Set[String] = { + var allFiles: Set[String] = Set.empty + var candidateFiles: Set[String] = Set.empty + val indexPath = metaClient.getZindexPath + val fs = metaClient.getFs + if (fs.exists(new Path(indexPath)) && dataFilters.nonEmpty) { + // try to load latest index table from index path + val candidateIndexTables = fs.listStatus(new Path(indexPath)).filter(_.isDirectory) + .map(_.getPath.getName).filter(f => completedCommits.contains(f)).sortBy(x => x) + if (candidateIndexTables.nonEmpty) { + val dataFrameOpt = try { + Some(spark.read.load(new Path(indexPath, candidateIndexTables.last).toString)) + } catch { + case _: Throwable => + logError("missing index skip data-skipping") + None + } + + if (dataFrameOpt.isDefined) { + val indexSchema = dataFrameOpt.get.schema + val indexFiles = DataSkippingUtils.getIndexFiles(spark.sparkContext.hadoopConfiguration, new Path(indexPath, candidateIndexTables.last).toString) + val indexFilter = dataFilters.map(DataSkippingUtils.createZindexFilter(_, indexSchema)).reduce(And) + logInfo(s"index filter condition: $indexFilter") + dataFrameOpt.get.persist() + if (indexFiles.size <= 4) { + allFiles = DataSkippingUtils.readParquetFile(spark, indexFiles) + } else { + allFiles = dataFrameOpt.get.select("file").collect().map(_.getString(0)).toSet + } + candidateFiles = dataFrameOpt.get.filter(new Column(indexFilter)).select("file").collect().map(_.getString(0)).toSet + dataFrameOpt.get.unpersist() + } + } + } + allFiles -- candidateFiles + } + /** * Invoked by Spark to fetch list of latest base files per partition. * @@ -159,12 +205,29 @@ case class HoodieFileIndex( */ override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { + // try to load filterFiles from index + val filterFiles: Set[String] = if (enableDataSkipping()) { + filterFilesByDataSkippingIndex(dataFilters) + } else { + Set.empty + } if (queryAsNonePartitionedTable) { // Read as Non-Partitioned table. 
- Seq(PartitionDirectory(InternalRow.empty, allFiles)) + val candidateFiles = if (!filterFiles.isEmpty) { + allFiles.filterNot(fileStatus => filterFiles.contains(fileStatus.getPath.getName)) + } else { + allFiles + } + logInfo(s"Total files : ${allFiles.size}," + + s" candidate files after data skipping: ${candidateFiles.size} " + + s" skipping percent ${if (allFiles.length != 0) (allFiles.size - candidateFiles.size) / allFiles.size.toDouble else 0}") + Seq(PartitionDirectory(InternalRow.empty, candidateFiles)) } else { // Prune the partition path by the partition filters val prunedPartitions = prunePartition(cachedAllInputFileSlices.keys.toSeq, partitionFilters) - prunedPartitions.map { partition => + var totalFileSize = 0 + var candidateFileSize = 0 + + val result = prunedPartitions.map { partition => val baseFileStatuses = cachedAllInputFileSlices(partition).map(fileSlice => { if (fileSlice.getBaseFile.isPresent) { fileSlice.getBaseFile.get().getFileStatus @@ -172,9 +235,19 @@ case class HoodieFileIndex( null } }).filterNot(_ == null) - - PartitionDirectory(partition.values, baseFileStatuses) + val candidateFiles = if (!filterFiles.isEmpty) { + baseFileStatuses.filterNot(fileStatus => filterFiles.contains(fileStatus.getPath.getName)) + } else { + baseFileStatuses + } + totalFileSize += baseFileStatuses.size + candidateFileSize += candidateFiles.size + PartitionDirectory(partition.values, candidateFiles) } + logInfo(s"Total files: ${totalFileSize}," + + s" Candidate files after data skipping : ${candidateFileSize} " + + s"skipping percent ${if (allFiles.length != 0) (totalFileSize - candidateFileSize) / totalFileSize.toDouble else 0}") + result } } @@ -256,7 +329,7 @@ case class HoodieFileIndex( .iterator().asScala.toSeq (p._1, fileSlices) }) - cachedFileSize = cachedAllInputFileSlices.values.flatten.map(_.getBaseFile.get().getFileLen).sum + cachedFileSize = cachedAllInputFileSlices.values.flatten.map(fileSliceSize).sum } // If the partition value contains InternalRow.empty, we query it as a non-partitioned table. 
@@ -266,6 +339,15 @@ case class HoodieFileIndex( s" spend: $flushSpend ms") } + private def fileSliceSize(fileSlice: FileSlice): Long = { + val logFileSize = fileSlice.getLogFiles.iterator().asScala.map(_.getFileSize).filter(_ > 0).sum + if (fileSlice.getBaseFile.isPresent) { + fileSlice.getBaseFile.get().getFileLen + logFileSize + } else { + logFileSize + } + } + override def sizeInBytes: Long = { cachedFileSize } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala index 398377dc64479..44d39facdea67 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala @@ -28,7 +28,7 @@ import org.apache.hudi.exception.HoodieException import org.apache.hudi.hadoop.config.HoodieRealtimeConfig import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS import org.apache.spark.rdd.RDD -import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer} +import org.apache.spark.sql.avro.{HoodieAvroSerializer, HooodieAvroDeserializer} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeProjection} import org.apache.spark.sql.execution.datasources.PartitionedFile @@ -119,7 +119,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, tableState.requiredStructSchema .map(f => tableAvroSchema.getField(f.name).pos()).toList private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema) - private val deserializer = new AvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) + private val deserializer = HooodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema) private val logRecords = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config).getRecords private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala @@ -135,7 +135,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, } else { val requiredAvroRecord = AvroConversionUtils .buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, requiredFieldPosition, recordBuilder) - recordToLoad = unsafeProjection(deserializer.deserialize(requiredAvroRecord).asInstanceOf[InternalRow]) + recordToLoad = unsafeProjection(deserializer.deserializeData(requiredAvroRecord).asInstanceOf[InternalRow]) true } } else { @@ -158,7 +158,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, tableState.requiredStructSchema .map(f => tableAvroSchema.getField(f.name).pos()).toList private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema) - private val deserializer = new AvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) + private val deserializer = HooodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema) private val logRecords = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config).getRecords private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala @@ -180,7 +180,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, } else { val requiredAvroRecord = AvroConversionUtils .buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, requiredFieldPosition, 
recordBuilder) - recordToLoad = unsafeProjection(deserializer.deserialize(requiredAvroRecord).asInstanceOf[InternalRow]) + recordToLoad = unsafeProjection(deserializer.deserializeData(requiredAvroRecord).asInstanceOf[InternalRow]) true } } else { @@ -203,8 +203,8 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, private val requiredFieldPosition = tableState.requiredStructSchema .map(f => tableAvroSchema.getField(f.name).pos()).toList - private val serializer = new AvroSerializer(tableState.tableStructSchema, tableAvroSchema, false) - private val requiredDeserializer = new AvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) + private val serializer = HoodieAvroSerializer(tableState.tableStructSchema, tableAvroSchema, false) + private val requiredDeserializer = HooodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema) private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema) private val logRecords = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config).getRecords @@ -236,7 +236,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, recordBuilder ) recordToLoad = unsafeProjection(requiredDeserializer - .deserialize(requiredAvroRecord).asInstanceOf[InternalRow]) + .deserializeData(requiredAvroRecord).asInstanceOf[InternalRow]) true } } else { @@ -264,7 +264,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, recordBuilder ) recordToLoad = unsafeProjection(requiredDeserializer - .deserialize(requiredAvroRecord).asInstanceOf[InternalRow]) + .deserializeData(requiredAvroRecord).asInstanceOf[InternalRow]) true } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 83b37acd5dbef..ddbd7fc06a95b 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -17,13 +17,13 @@ package org.apache.hudi - import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.DataSourceOptionsHelper.{allAlternatives, translateConfigurations} import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.client.{HoodieWriteResult, SparkRDDWriteClient} import org.apache.hudi.common.config.{HoodieConfig, HoodieMetadataConfig, TypedProperties} @@ -31,13 +31,13 @@ import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.{HoodieRecordPayload, HoodieTableType, WriteOperationType} import org.apache.hudi.common.table.timeline.HoodieActiveTimeline import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.util.{CommitUtils, ReflectionUtils} +import org.apache.hudi.common.util.{CommitUtils, ReflectionUtils, StringUtils} import org.apache.hudi.config.HoodieBootstrapConfig.{BASE_PATH, INDEX_CLASS_NAME} import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig} import org.apache.hudi.exception.HoodieException import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRowsFactory, NonSortPartitionerWithRows} import 
org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} -import org.apache.hudi.index.SparkHoodieIndex +import org.apache.hudi.index.SparkHoodieIndexFactory import org.apache.hudi.internal.DataSourceInternalWriterHelper import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory import org.apache.hudi.sync.common.AbstractSyncTool @@ -49,10 +49,12 @@ import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SaveMode, SparkSession} import org.apache.spark.{SPARK_VERSION, SparkContext} - import java.util import java.util.Properties + import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.collection.mutable.StringBuilder import scala.collection.mutable.ListBuffer object HoodieSparkSqlWriter { @@ -64,7 +66,7 @@ object HoodieSparkSqlWriter { def write(sqlContext: SQLContext, mode: SaveMode, - parameters: Map[String, String], + optParams: Map[String, String], df: DataFrame, hoodieTableConfigOpt: Option[HoodieTableConfig] = Option.empty, hoodieWriteClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty, @@ -74,16 +76,23 @@ object HoodieSparkSqlWriter { : (Boolean, common.util.Option[String], common.util.Option[String], common.util.Option[String], SparkRDDWriteClient[HoodieRecordPayload[Nothing]], HoodieTableConfig) = { + assert(optParams.get("path").exists(!StringUtils.isNullOrEmpty(_)), "'path' must be set") + val path = optParams("path") + val basePath = new Path(path) val sparkContext = sqlContext.sparkContext - val path = parameters.get("path") - val hoodieConfig = HoodieWriterUtils.convertMapToHoodieConfig(parameters) - val tblNameOp = hoodieConfig.getStringOrThrow(HoodieWriteConfig.TBL_NAME, s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.") + val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) + tableExists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) + var tableConfig = getHoodieTableConfig(sparkContext, path, hoodieTableConfigOpt) + validateTableConfig(sqlContext.sparkSession, optParams, tableConfig) + + val (parameters, hoodieConfig) = mergeParamsAndGetHoodieConfig(optParams, tableConfig) + val tblName = hoodieConfig.getStringOrThrow(HoodieWriteConfig.TBL_NAME, + s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.").trim + assert(!StringUtils.isNullOrEmpty(hoodieConfig.getString(HoodieWriteConfig.TBL_NAME)), + s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.") + asyncCompactionTriggerFnDefined = asyncCompactionTriggerFn.isDefined asyncClusteringTriggerFnDefined = asyncClusteringTriggerFn.isDefined - if (path.isEmpty) { - throw new HoodieException(s"'path' must be set.") - } - val tblName = tblNameOp.trim sparkContext.getConf.getOption("spark.serializer") match { case Some(ser) if ser.equals("org.apache.spark.serializer.KryoSerializer") => case _ => throw new HoodieException("hoodie only support org.apache.spark.serializer.KryoSerializer as spark.serializer") @@ -104,12 +113,8 @@ object HoodieSparkSqlWriter { } val jsc = new JavaSparkContext(sparkContext) - val basePath = new Path(path.get) val instantTime = HoodieActiveTimeline.createNewInstantTime() - val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) - tableExists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) - var tableConfig = getHoodieTableConfig(sparkContext, path.get, hoodieTableConfigOpt) - val keyGenerator = 
HoodieSparkKeyGeneratorFactory.createKeyGenerator(toProperties(parameters)) + val keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(hoodieConfig.getProps)) if (mode == SaveMode.Ignore && tableExists) { log.warn(s"hoodie table at $basePath already exists. Ignoring & not performing actual writes.") @@ -123,7 +128,7 @@ object HoodieSparkSqlWriter { val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieTableConfig.BASE_FILE_FORMAT) val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.ARCHIVELOG_FOLDER) val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD) - val populateMetaFields = parameters.getOrElse(HoodieTableConfig.POPULATE_META_FIELDS.key(), HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()).toBoolean + val populateMetaFields = hoodieConfig.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS) val tableMetaClient = HoodieTableMetaClient.withPropertyBuilder() .setTableType(tableType) @@ -137,7 +142,9 @@ object HoodieSparkSqlWriter { .setPopulateMetaFields(populateMetaFields) .setRecordKeyFields(hoodieConfig.getString(RECORDKEY_FIELD)) .setKeyGeneratorClassProp(hoodieConfig.getString(KEYGENERATOR_CLASS_NAME)) - .initTable(sparkContext.hadoopConfiguration, path.get) + .setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING)) + .setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING)) + .initTable(sparkContext.hadoopConfiguration, path) tableConfig = tableMetaClient.getTableConfig } @@ -168,7 +175,7 @@ object HoodieSparkSqlWriter { // Create a HoodieWriteClient & issue the delete. val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, - null, path.get, tblName, + null, path, tblName, mapAsJavaMap(parameters - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key))) .asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] @@ -199,7 +206,7 @@ object HoodieSparkSqlWriter { } // Create a HoodieWriteClient & issue the delete. val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, - null, path.get, tblName, + null, path, tblName, mapAsJavaMap(parameters - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key))) .asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] // Issue delete partitions @@ -243,7 +250,7 @@ object HoodieSparkSqlWriter { val writeSchema = if (dropPartitionColumns) generateSchemaWithoutPartitionColumns(partitionColumns, schema) else schema // Create a HoodieWriteClient & issue the write. 
- val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, writeSchema.toString, path.get, + val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, writeSchema.toString, path, tblName, mapAsJavaMap(parameters - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key) )).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] @@ -325,14 +332,21 @@ object HoodieSparkSqlWriter { def bootstrap(sqlContext: SQLContext, mode: SaveMode, - parameters: Map[String, String], + optParams: Map[String, String], df: DataFrame, hoodieTableConfigOpt: Option[HoodieTableConfig] = Option.empty, hoodieWriteClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty): Boolean = { + assert(optParams.get("path").exists(!StringUtils.isNullOrEmpty(_)), "'path' must be set") + val path = optParams("path") + val basePath = new Path(path) val sparkContext = sqlContext.sparkContext - val path = parameters.getOrElse("path", throw new HoodieException("'path' must be set.")) - val hoodieConfig = HoodieWriterUtils.convertMapToHoodieConfig(parameters) + val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) + tableExists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) + var tableConfig = getHoodieTableConfig(sparkContext, path, hoodieTableConfigOpt) + validateTableConfig(sqlContext.sparkSession, optParams, tableConfig) + + val (parameters, hoodieConfig) = mergeParamsAndGetHoodieConfig(optParams, tableConfig) val tableName = hoodieConfig.getStringOrThrow(HoodieWriteConfig.TBL_NAME, s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.") val tableType = hoodieConfig.getStringOrDefault(TABLE_TYPE) val bootstrapBasePath = hoodieConfig.getStringOrThrow(BASE_PATH, @@ -348,10 +362,6 @@ object HoodieSparkSqlWriter { schema = HoodieAvroUtils.getNullSchema.toString } - val basePath = new Path(path) - val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) - tableExists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) - val tableConfig = getHoodieTableConfig(sparkContext, path, hoodieTableConfigOpt) // Handle various save modes if (mode == SaveMode.Ignore && tableExists) { @@ -367,6 +377,7 @@ object HoodieSparkSqlWriter { val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD) val keyGenProp = hoodieConfig.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME) val populateMetaFields = parameters.getOrElse(HoodieTableConfig.POPULATE_META_FIELDS.key(), HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()).toBoolean + val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieTableConfig.BASE_FILE_FORMAT) HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.valueOf(tableType)) @@ -376,10 +387,13 @@ object HoodieSparkSqlWriter { .setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_NAME)) .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD, null)) .setBootstrapIndexClass(bootstrapIndexClass) + .setBaseFileFormat(baseFileFormat) .setBootstrapBasePath(bootstrapBasePath) .setPartitionFields(partitionColumns) .setPopulateMetaFields(populateMetaFields) .setKeyGeneratorClassProp(keyGenProp) + .setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING)) + .setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING)) .initTable(sparkContext.hadoopConfiguration, path) } @@ -400,7 +414,7 @@ object HoodieSparkSqlWriter { df: DataFrame, tblName: String, basePath: Path, - path: Option[String], + path: String, 
instantTime: String, partitionColumns: String): (Boolean, common.util.Option[String]) = { val sparkContext = sqlContext.sparkContext @@ -423,7 +437,7 @@ object HoodieSparkSqlWriter { throw new HoodieException("Dropping duplicates with bulk_insert in row writer path is not supported yet") } val params = parameters.updated(HoodieWriteConfig.AVRO_SCHEMA_STRING.key, schema.toString) - val writeConfig = DataSourceUtils.createHoodieConfig(schema.toString, path.get, tblName, mapAsJavaMap(params)) + val writeConfig = DataSourceUtils.createHoodieConfig(schema.toString, path, tblName, mapAsJavaMap(params)) val bulkInsertPartitionerRows : BulkInsertPartitioner[Dataset[Row]] = if (populateMetaFields) { val userDefinedBulkInsertPartitionerOpt = DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(writeConfig) if (userDefinedBulkInsertPartitionerOpt.isPresent) { @@ -439,7 +453,7 @@ object HoodieSparkSqlWriter { val arePartitionRecordsSorted = bulkInsertPartitionerRows.arePartitionRecordsSorted(); parameters.updated(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED, arePartitionRecordsSorted.toString) val isGlobalIndex = if (populateMetaFields) { - SparkHoodieIndex.isGlobalIndex(writeConfig) + SparkHoodieIndexFactory.isGlobalIndex(writeConfig) } else { false } @@ -539,7 +553,6 @@ object HoodieSparkSqlWriter { hiveSyncConfig.partitionValueExtractorClass = hoodieConfig.getString(HIVE_PARTITION_EXTRACTOR_CLASS) hiveSyncConfig.useJdbc = hoodieConfig.getBoolean(HIVE_USE_JDBC) hiveSyncConfig.useFileListingFromMetadata = hoodieConfig.getBoolean(HoodieMetadataConfig.ENABLE) - hiveSyncConfig.verifyMetadataFileListing = hoodieConfig.getBoolean(HoodieMetadataConfig.VALIDATE_ENABLE) hiveSyncConfig.ignoreExceptions = hoodieConfig.getStringOrDefault(HIVE_IGNORE_EXCEPTIONS).toBoolean hiveSyncConfig.supportTimestamp = hoodieConfig.getStringOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE).toBoolean hiveSyncConfig.autoCreateDatabase = hoodieConfig.getStringOrDefault(HIVE_AUTO_CREATE_DATABASE).toBoolean @@ -699,4 +712,49 @@ object HoodieSparkSqlWriter { null } } + + private def validateTableConfig(spark: SparkSession, params: Map[String, String], + tableConfig: HoodieTableConfig): Unit = { + val resolver = spark.sessionState.conf.resolver + val diffConfigs = StringBuilder.newBuilder + params.foreach { case (key, value) => + val existingValue = getStringFromTableConfigWithAlternatives(tableConfig, key) + if (null != existingValue && !resolver(existingValue, value)) { + diffConfigs.append(s"$key:\t$value\t${tableConfig.getString(key)}\n") + } + } + if (diffConfigs.nonEmpty) { + diffConfigs.insert(0, "\nConfig conflict(key\tcurrent value\texisting value):\n") + throw new HoodieException(diffConfigs.toString.trim) + } + } + + private def mergeParamsAndGetHoodieConfig(optParams: Map[String, String], + tableConfig: HoodieTableConfig): (Map[String, String], HoodieConfig) = { + val mergedParams = mutable.Map.empty ++ + DataSourceWriteOptions.translateSqlOptions(HoodieWriterUtils.parametersWithWriteDefaults(optParams)) + if (!mergedParams.contains(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) + && mergedParams.contains(KEYGENERATOR_CLASS_NAME.key)) { + mergedParams(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) = mergedParams(KEYGENERATOR_CLASS_NAME.key) + } + if (null != tableConfig) { + tableConfig.getProps.foreach { case (key, value) => + mergedParams(key) = value + } + } + val params = mergedParams.toMap + (params, HoodieWriterUtils.convertMapToHoodieConfig(params)) + } + + private def 
getStringFromTableConfigWithAlternatives(tableConfig: HoodieTableConfig, key: String): String = { + if (null == tableConfig) { + null + } else { + if (allAlternatives.contains(key)) { + tableConfig.getString(allAlternatives(key)) + } else { + tableConfig.getString(key) + } + } + } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala index b1f8eb57ba59b..6e736d225a523 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala @@ -48,9 +48,12 @@ class HoodieStreamingSink(sqlContext: SQLContext, private val log = LogManager.getLogger(classOf[HoodieStreamingSink]) - private val retryCnt = options(DataSourceWriteOptions.STREAMING_RETRY_CNT.key).toInt - private val retryIntervalMs = options(DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS.key).toLong - private val ignoreFailedBatch = options(DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH.key).toBoolean + private val retryCnt = options.getOrDefault(DataSourceWriteOptions.STREAMING_RETRY_CNT.key, + DataSourceWriteOptions.STREAMING_RETRY_CNT.defaultValue).toInt + private val retryIntervalMs = options.getOrDefault(DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS.key, + DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS.defaultValue).toLong + private val ignoreFailedBatch = options.getOrDefault(DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH.key, + DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH.defaultValue).toBoolean private var isAsyncCompactorServiceShutdownAbnormally = false private var isAsyncClusteringServiceShutdownAbnormally = false diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala index 51b632b50370b..0e3ede1fe3ebc 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala @@ -18,13 +18,14 @@ package org.apache.hudi import org.apache.hudi.DataSourceWriteOptions._ -import org.apache.hudi.common.config.HoodieMetadataConfig.{ENABLE, VALIDATE_ENABLE} +import org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE import org.apache.hudi.common.config.{HoodieConfig, TypedProperties} -import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory import java.util.Properties import scala.collection.JavaConversions.mapAsJavaMap import scala.collection.JavaConverters.{mapAsScalaMapConverter, _} +import scala.collection.JavaConverters.mapAsScalaMapConverter +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory /** * WriterUtils to assist in write path in Datasource and tests. 
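Note: `validateTableConfig` above rejects a write whose options contradict the table config persisted in `hoodie.properties`, reporting every conflicting key as a "key / current value / existing value" row. A minimal sketch of that check, using plain Maps in place of `HoodieTableConfig`, a case-insensitive comparison in place of Spark's session resolver, and `IllegalArgumentException` in place of `HoodieException`:

```scala
// Self-contained sketch of the conflict check: incoming write options are compared
// against the persisted table config, and every mismatch is collected before the
// write is rejected with a tab-separated conflict report.
object TableConfigConflictSketch {
  def validateAgainstTableConfig(params: Map[String, String],
                                 tableConfig: Map[String, String]): Unit = {
    val diffs = params.toSeq.flatMap { case (key, value) =>
      tableConfig.get(key) match {
        case Some(existing) if !existing.equalsIgnoreCase(value) =>
          Some(s"$key:\t$value\t$existing")
        case _ => None
      }
    }
    if (diffs.nonEmpty) {
      throw new IllegalArgumentException(
        ("Config conflict(key\tcurrent value\texisting value):" +: diffs).mkString("\n"))
    }
  }

  def main(args: Array[String]): Unit = {
    // Changing the record key of an existing table is reported as a conflict.
    try {
      validateAgainstTableConfig(
        Map("hoodie.datasource.write.recordkey.field" -> "uuid"),
        Map("hoodie.datasource.write.recordkey.field" -> "id"))
    } catch {
      case e: IllegalArgumentException => println(e.getMessage)
    }
  }
}
```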
@@ -50,7 +51,6 @@ object HoodieWriterUtils { PARTITIONPATH_FIELD.key -> PARTITIONPATH_FIELD.defaultValue, KEYGENERATOR_CLASS_NAME.key -> DEFAULT_KEYGENERATOR_CLASS_OPT_VAL, ENABLE.key -> ENABLE.defaultValue.toString, - VALIDATE_ENABLE.key -> VALIDATE_ENABLE.defaultValue.toString, COMMIT_METADATA_KEYPREFIX.key -> COMMIT_METADATA_KEYPREFIX.defaultValue, INSERT_DROP_DUPS.key -> INSERT_DROP_DUPS.defaultValue, STREAMING_RETRY_CNT.key -> STREAMING_RETRY_CNT.defaultValue, @@ -92,10 +92,9 @@ object HoodieWriterUtils { * @return */ def getPartitionColumns(parameters: Map[String, String]): String = { - val props = new TypedProperties() + val props = new Properties() props.putAll(parameters.asJava) - val keyGen = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props) - HoodieSparkUtils.getPartitionColumns(keyGen, props) + HoodieSparkUtils.getPartitionColumns(props) } def convertMapToHoodieConfig(parameters: Map[String, String]): HoodieConfig = { diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala index f1bc847514df4..b4a9800d994b9 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala @@ -22,8 +22,10 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.exception.HoodieException import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.listAffectedFilesForCommits +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getCommitMetadata +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getWritePartitionPaths import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes -import org.apache.hadoop.fs.{FileStatus, GlobPattern, Path} +import org.apache.hadoop.fs.{GlobPattern, Path} import org.apache.hadoop.mapred.JobConf import org.apache.log4j.LogManager import org.apache.spark.rdd.RDD @@ -35,7 +37,6 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} import scala.collection.JavaConversions._ -import scala.collection.mutable.ListBuffer /** * Experimental. 
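Note: the `HoodieWriterUtils` and `HoodieStreamingSink` changes above both rely on the usual option-layering pattern: library defaults sit underneath, user-supplied options win, and deprecated keys are translated to their current names before use. A rough sketch of that pattern is shown below; the `hoodie.sketch.*` keys and the alias mapping are placeholders for the example, not real Hudi configuration names.

```scala
// Sketch of defaults layering and legacy-key translation. Key names are invented
// for illustration; real Hudi keys are not reproduced here.
object WriteDefaultsSketch {
  private val writeDefaults = Map(
    "hoodie.sketch.streaming.retry.count"    -> "3",
    "hoodie.sketch.streaming.retry.interval" -> "2000",
    "hoodie.sketch.insert.drop.duplicates"   -> "false")

  // Deprecated key -> current key (purely illustrative).
  private val legacyKeyAliases = Map(
    "hoodie.sketch.recordkey.field.old" -> "hoodie.sketch.recordkey.field")

  def withWriteDefaults(userOptions: Map[String, String]): Map[String, String] = {
    val translated = userOptions.map { case (k, v) => legacyKeyAliases.getOrElse(k, k) -> v }
    writeDefaults ++ translated // user-supplied values win over the defaults
  }

  def main(args: Array[String]): Unit = {
    val merged = withWriteDefaults(Map("hoodie.sketch.streaming.retry.count" -> "5"))
    assert(merged("hoodie.sketch.streaming.retry.count") == "5")        // user value kept
    assert(merged("hoodie.sketch.streaming.retry.interval") == "2000")  // default filled in
  }
}
```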
@@ -162,16 +163,12 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext, } def buildFileIndex(): List[HoodieMergeOnReadFileSplit] = { - val partitionsWithFileStatus = listAffectedFilesForCommits(new Path(metaClient.getBasePath), - commitsToReturn, commitsTimelineToReturn) - val affectedFileStatus = new ListBuffer[FileStatus] - partitionsWithFileStatus.iterator.foreach(p => - p._2.iterator.foreach(status => affectedFileStatus += status._2)) - val fsView = new HoodieTableFileSystemView(metaClient, - commitsTimelineToReturn, affectedFileStatus.toArray) + val metadataList = commitsToReturn.map(instant => getCommitMetadata(instant, commitsTimelineToReturn)) + val affectedFileStatus = listAffectedFilesForCommits(new Path(metaClient.getBasePath), metadataList) + val fsView = new HoodieTableFileSystemView(metaClient, commitsTimelineToReturn, affectedFileStatus) // Iterate partitions to create splits - val fileGroup = partitionsWithFileStatus.keySet().flatMap(partitionPath => + val fileGroup = getWritePartitionPaths(metadataList).flatMap(partitionPath => fsView.getAllFileGroups(partitionPath).iterator() ).toList val latestCommit = fsView.getLastInstant.get().getTimestamp diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala new file mode 100644 index 0000000000000..b464c2dc5d611 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +import org.apache.avro.Schema +import org.apache.spark.sql.types.DataType + +/** + * As AvroSerializer cannot be access out of the spark.sql.avro package since spark 3.1, we define + * this class to be accessed by other class. + */ +case class HoodieAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean) + extends AvroSerializer(rootCatalystType, rootAvroType, nullable) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/avro/HooodieAvroDeserializer.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/avro/HooodieAvroDeserializer.scala new file mode 100644 index 0000000000000..ba911a7b3075c --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/avro/HooodieAvroDeserializer.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +import org.apache.avro.Schema +import org.apache.spark.sql.types.DataType + +/** + * This is to be compatible with the type returned by Spark 3.1 + * and other spark versions for AvroDeserializer + */ +case class HooodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) + extends AvroDeserializer(rootAvroType, rootCatalystType) { + + def deserializeData(data: Any): Any = { + super.deserialize(data) match { + case Some(r) => r // spark 3.1 return type is Option, we fetch the data. + case o => o // for other spark version, return the data directly. + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala new file mode 100644 index 0000000000000..45a7aec142d5a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, EqualNullSafe, EqualTo, Expression, ExtractValue, GetStructField, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Literal, Not, Or, StartsWith} +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.unsafe.types.UTF8String + +import scala.collection.JavaConverters._ + +object DataSkippingUtils { + + /** + * create z_index filter and push those filters to index table to filter all candidate scan files. + * @param condition origin filter from query. + * @param indexSchema schema from index table. + * @return filters for index table. + */ + def createZindexFilter(condition: Expression, indexSchema: StructType): Expression = { + def buildExpressionInternal(colName: Seq[String], statisticValue: String): Expression = { + val appendColName = UnresolvedAttribute(colName).name + statisticValue + col(appendColName).expr + } + + def reWriteCondition(colName: Seq[String], conditionExpress: Expression): Expression = { + val appendColName = UnresolvedAttribute(colName).name + "_minValue" + if (indexSchema.exists(p => p.name == appendColName)) { + conditionExpress + } else { + Literal.TrueLiteral + } + } + + val minValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_minValue") + val maxValue = (colName: Seq[String]) => buildExpressionInternal(colName, "_maxValue") + val num_nulls = (colName: Seq[String]) => buildExpressionInternal(colName, "_num_nulls") + + condition match { + // query filter "colA = b" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table + case EqualTo(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))) + // query filter "b = colA" convert it to "colA_minValue <= b and colA_maxValue >= b" for index table + case EqualTo(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, And(LessThanOrEqual(minValue(colName), value), GreaterThanOrEqual(maxValue(colName), value))) + // query filter "colA = null" convert it to "colA_num_nulls = null" for index table + case equalNullSafe @ EqualNullSafe(_: AttributeReference, _ @ Literal(null, _)) => + val colName = getTargetColNameParts(equalNullSafe.left) + reWriteCondition(colName, EqualTo(num_nulls(colName), equalNullSafe.right)) + // query filter "colA < b" convert it to "colA_minValue < b" for index table + case LessThan(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName,LessThan(minValue(colName), value)) + // query filter "b < colA" convert it to "colA_maxValue > b" for index table + case LessThan(value: Literal, attribute: AttributeReference) => + val 
colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(maxValue(colName), value)) + // query filter "colA > b" convert it to "colA_maxValue > b" for index table + case GreaterThan(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(maxValue(colName), value)) + // query filter "b > colA" convert it to "colA_minValue < b" for index table + case GreaterThan(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, LessThan(minValue(colName), value)) + // query filter "colA <= b" convert it to "colA_minValue <= b" for index table + case LessThanOrEqual(attribute: AttributeReference, value: Literal) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, LessThanOrEqual(minValue(colName), value)) + // query filter "b <= colA" convert it to "colA_maxValue >= b" for index table + case LessThanOrEqual(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThanOrEqual(maxValue(colName), value)) + // query filter "colA >= b" convert it to "colA_maxValue >= b" for index table + case GreaterThanOrEqual(attribute: AttributeReference, right: Literal) => + val colName = getTargetColNameParts(attribute) + GreaterThanOrEqual(maxValue(colName), right) + // query filter "b >= colA" convert it to "colA_minValue <= b" for index table + case GreaterThanOrEqual(value: Literal, attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, LessThanOrEqual(minValue(colName), value)) + // query filter "colA is null" convert it to "colA_num_nulls > 0" for index table + case IsNull(attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, GreaterThan(num_nulls(colName), Literal(0))) + // query filter "colA is not null" convert it to "colA_num_nulls = 0" for index table + case IsNotNull(attribute: AttributeReference) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, EqualTo(num_nulls(colName), Literal(0))) + // query filter "colA in (a,b)" convert it to " (colA_minValue <= a and colA_maxValue >= a) or (colA_minValue <= b and colA_maxValue >= b) " for index table + case In(attribute: AttributeReference, list: Seq[Literal]) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, list.map { lit => + And(LessThanOrEqual(minValue(colName), lit), GreaterThanOrEqual(maxValue(colName), lit)) + }.reduce(Or)) + // query filter "colA like xxx" convert it to " (colA_minValue <= xxx and colA_maxValue >= xxx) or (colA_min start with xxx or colA_max start with xxx) " for index table + case StartsWith(attribute, v @ Literal(_: UTF8String, _)) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Or(And(LessThanOrEqual(minValue(colName), v), GreaterThanOrEqual(maxValue(colName), v)) , + Or(StartsWith(minValue(colName), v), StartsWith(maxValue(colName), v)))) + // query filter "colA not in (a, b)" convert it to " (not( colA_minValue = a and colA_maxValue = a)) and (not( colA_minValue = b and colA_maxValue = b)) " for index table + case Not(In(attribute: AttributeReference, list: Seq[Literal])) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, list.map { lit => + Not(And(EqualTo(minValue(colName), lit), EqualTo(maxValue(colName), lit))) + 
}.reduce(And)) + // query filter "colA != b" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table + case Not(EqualTo(attribute: AttributeReference, value: Literal)) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value)))) + // query filter "b != colA" convert it to "not ( colA_minValue = b and colA_maxValue = b )" for index table + case Not(EqualTo(value: Literal, attribute: AttributeReference)) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value)))) + // query filter "colA not like xxxx" convert it to "not ( colA_minValue startWith xxx and colA_maxValue startWith xxx)" for index table + case Not(StartsWith(attribute, value @ Literal(_: UTF8String, _))) => + val colName = getTargetColNameParts(attribute) + reWriteCondition(colName, Not(And(StartsWith(minValue(colName), value), StartsWith(maxValue(colName), value)))) + case or: Or => + val resLeft = createZindexFilter(or.left, indexSchema) + val resRight = createZindexFilter(or.right, indexSchema) + Or(resLeft, resRight) + + case and: And => + val resLeft = createZindexFilter(and.left, indexSchema) + val resRight = createZindexFilter(and.right, indexSchema) + And(resLeft, resRight) + + case expr: Expression => + Literal.TrueLiteral + } + } + + /** + * Extracts name from a resolved expression referring to a nested or non-nested column. + */ + def getTargetColNameParts(resolvedTargetCol: Expression): Seq[String] = { + resolvedTargetCol match { + case attr: Attribute => Seq(attr.name) + + case Alias(c, _) => getTargetColNameParts(c) + + case GetStructField(c, _, Some(name)) => getTargetColNameParts(c) :+ name + + case ex: ExtractValue => + throw new AnalysisException(s"convert reference to name failed, Updating nested fields is only supported for StructType: ${ex}.") + + case other => + throw new AnalysisException(s"convert reference to name failed, Found unsupported expression ${other}") + } + } + + def getIndexFiles(conf: Configuration, indexPath: String): Seq[FileStatus] = { + val basePath = new Path(indexPath) + basePath.getFileSystem(conf) + .listStatus(basePath).filterNot(f => f.getPath.getName.endsWith(".parquet")) + } + + /** + * read parquet files concurrently by local. 
+ * this method is mush faster than spark + */ + def readParquetFile(spark: SparkSession, indexFiles: Seq[FileStatus], filters: Seq[Filter] = Nil, schemaOpts: Option[StructType] = None): Set[String] = { + val hadoopConf = spark.sparkContext.hadoopConfiguration + val partitionedFiles = indexFiles.map(f => PartitionedFile(InternalRow.empty, f.getPath.toString, 0, f.getLen)) + + val requiredSchema = new StructType().add("file", StringType, true) + val schema = schemaOpts.getOrElse(requiredSchema) + val parquetReader = new ParquetFileFormat().buildReaderWithPartitionValues(spark + , schema , StructType(Nil), requiredSchema, filters, Map.empty, hadoopConf) + val results = new Array[Iterator[String]](partitionedFiles.size) + partitionedFiles.zipWithIndex.par.foreach { case (pf, index) => + val fileIterator = parquetReader(pf).asInstanceOf[Iterator[Any]] + val rows = fileIterator.flatMap(_ match { + case r: InternalRow => Seq(r) + case b: ColumnarBatch => b.rowIterator().asScala + }).map(r => r.getString(0)) + results(index) = rows + } + results.flatMap(f => f).toSet + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala index 25d3026f1210e..963035cb638d3 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala @@ -120,8 +120,13 @@ object HoodieOptionConfig { */ def mappingSqlOptionToTableConfig(options: Map[String, String]): Map[String, String] = { defaultTableConfig ++ - options.filterKeys(k => keyTableConfigMapping.contains(k)) - .map(kv => keyTableConfigMapping(kv._1) -> valueMapping.getOrElse(kv._2, kv._2)) + options.map { case (k, v) => + if (keyTableConfigMapping.contains(k)) { + keyTableConfigMapping(k) -> valueMapping.getOrElse(v, v) + } else { + k -> v + } + } } /** diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala index c1130d2515f30..cf9c49ef02a9c 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala @@ -19,14 +19,19 @@ package org.apache.spark.sql.hudi import scala.collection.JavaConverters._ import java.net.URI -import java.util.{Date, Locale} +import java.util.{Date, Locale, Properties} + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.common.table.timeline.HoodieActiveTimeline import org.apache.spark.SPARK_VERSION +import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.avro.SchemaConverters import org.apache.spark.sql.{Column, DataFrame, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier @@ -36,6 +41,7 @@ import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Cast, Expressi import 
org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MergeIntoTable, SubqueryAlias} import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.types.{DataType, NullType, StringType, StructField, StructType} import java.text.SimpleDateFormat @@ -80,6 +86,54 @@ object HoodieSqlUtils extends SparkAdapterSupport { .asInstanceOf[StructType]).map(removeMetaFields) } + def getAllPartitionPaths(spark: SparkSession, table: CatalogTable): Seq[String] = { + val sparkEngine = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)) + val metadataConfig = { + val properties = new Properties() + properties.putAll((spark.sessionState.conf.getAllConfs ++ table.storage.properties).asJava) + HoodieMetadataConfig.newBuilder.fromProperties(properties).build() + } + FSUtils.getAllPartitionPaths(sparkEngine, metadataConfig, getTableLocation(table, spark)).asScala + } + + /** + * This method is used to compatible with the old non-hive-styled partition table. + * By default we enable the "hoodie.datasource.write.hive_style_partitioning" + * when writing data to hudi table by spark sql by default. + * If the exist table is a non-hive-styled partitioned table, we should + * disable the "hoodie.datasource.write.hive_style_partitioning" when + * merge or update the table. Or else, we will get an incorrect merge result + * as the partition path mismatch. + */ + def isHiveStyledPartitioning(partitionPaths: Seq[String], table: CatalogTable): Boolean = { + if (table.partitionColumnNames.nonEmpty) { + val isHiveStylePartitionPath = (path: String) => { + val fragments = path.split("/") + if (fragments.size != table.partitionColumnNames.size) { + false + } else { + fragments.zip(table.partitionColumnNames).forall { + case (pathFragment, partitionColumn) => pathFragment.startsWith(s"$partitionColumn=") + } + } + } + partitionPaths.forall(isHiveStylePartitionPath) + } else { + true + } + } + + /** + * Determine whether URL encoding is enabled + */ + def isUrlEncodeEnabled(partitionPaths: Seq[String], table: CatalogTable): Boolean = { + if (table.partitionColumnNames.nonEmpty) { + partitionPaths.forall(partitionPath => partitionPath.split("/").length == table.partitionColumnNames.size) + } else { + false + } + } + private def tripAlias(plan: LogicalPlan): LogicalPlan = { plan match { case SubqueryAlias(_, relation: LogicalPlan) => @@ -239,12 +293,12 @@ object HoodieSqlUtils extends SparkAdapterSupport { */ def formatQueryInstant(queryInstant: String): String = { if (queryInstant.length == 19) { // for yyyy-MM-dd HH:mm:ss - HoodieActiveTimeline.COMMIT_FORMATTER.format(defaultDateTimeFormat.parse(queryInstant)) + HoodieActiveTimeline.formatInstantTime(defaultDateTimeFormat.parse(queryInstant)) } else if (queryInstant.length == 14) { // for yyyyMMddHHmmss - HoodieActiveTimeline.COMMIT_FORMATTER.parse(queryInstant) // validate the format + HoodieActiveTimeline.parseInstantTime(queryInstant) // validate the format queryInstant } else if (queryInstant.length == 10) { // for yyyy-MM-dd - HoodieActiveTimeline.COMMIT_FORMATTER.format(defaultDateFormat.parse(queryInstant)) + HoodieActiveTimeline.formatInstantTime(defaultDateFormat.parse(queryInstant)) } else { throw new IllegalArgumentException(s"Unsupported query instant time format: $queryInstant," + s"Supported time format are: 'yyyy-MM-dd: HH:mm:ss' or 'yyyy-MM-dd' or 'yyyyMMddHHmmss'") diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala index a588eb604d4d7..87cbb8a7f0306 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala @@ -29,13 +29,13 @@ import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, Literal, NamedExpression} import org.apache.spark.sql.catalyst.plans.Inner -import org.apache.spark.sql.catalyst.plans.logical.{Assignment, CompactionPath, CompactionShowOnPath, CompactionShowOnTable, CompactionTable, DeleteAction, DeleteFromTable, InsertAction, LogicalPlan, MergeIntoTable, Project, UpdateAction, UpdateTable} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.command.{AlterTableAddColumnsCommand, AlterTableChangeColumnCommand, AlterTableRenameCommand, CreateDataSourceTableCommand, TruncateTableCommand} +import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation} import org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlUtils} import org.apache.spark.sql.hudi.HoodieSqlUtils._ -import org.apache.spark.sql.hudi.command.{AlterHoodieTableAddColumnsCommand, AlterHoodieTableChangeColumnCommand, AlterHoodieTableRenameCommand, CompactionHoodiePathCommand, CompactionHoodieTableCommand, CompactionShowHoodiePathCommand, CompactionShowHoodieTableCommand, CreateHoodieTableAsSelectCommand, CreateHoodieTableCommand, DeleteHoodieTableCommand, InsertIntoHoodieTableCommand, MergeIntoHoodieTableCommand, TruncateHoodieTableCommand, UpdateHoodieTableCommand} +import org.apache.spark.sql.hudi.command._ import org.apache.spark.sql.types.StringType object HoodieAnalysis { @@ -125,6 +125,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi case mergeInto @ MergeIntoTable(target, source, mergeCondition, matchedActions, notMatchedActions) if isHoodieTable(target, sparkSession) && target.resolved => + val resolver = sparkSession.sessionState.conf.resolver val resolvedSource = analyzer.execute(source) def isInsertOrUpdateStar(assignments: Seq[Assignment]): Boolean = { if (assignments.isEmpty) { @@ -161,23 +162,21 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi val resolvedCondition = condition.map(resolveExpressionFrom(resolvedSource)(_)) val resolvedAssignments = if (isInsertOrUpdateStar(assignments)) { // assignments is empty means insert * or update set * - val resolvedSourceOutputWithoutMetaFields = resolvedSource.output.filter(attr => !HoodieSqlUtils.isMetaField(attr.name)) - val targetOutputWithoutMetaFields = target.output.filter(attr => !HoodieSqlUtils.isMetaField(attr.name)) - val resolvedSourceColumnNamesWithoutMetaFields = resolvedSourceOutputWithoutMetaFields.map(_.name) - val targetColumnNamesWithoutMetaFields = targetOutputWithoutMetaFields.map(_.name) + val resolvedSourceOutput = resolvedSource.output.filter(attr => !HoodieSqlUtils.isMetaField(attr.name)) + val targetOutput = target.output.filter(attr => !HoodieSqlUtils.isMetaField(attr.name)) + val resolvedSourceColumnNames 
= resolvedSourceOutput.map(_.name) - if(targetColumnNamesWithoutMetaFields.toSet.subsetOf(resolvedSourceColumnNamesWithoutMetaFields.toSet)){ + if(targetOutput.filter(attr => resolvedSourceColumnNames.exists(resolver(_, attr.name))).equals(targetOutput)){ //If sourceTable's columns contains all targetTable's columns, //We fill assign all the source fields to the target fields by column name matching. - val sourceColNameAttrMap = resolvedSourceOutputWithoutMetaFields.map(attr => (attr.name, attr)).toMap - targetOutputWithoutMetaFields.map(targetAttr => { - val sourceAttr = sourceColNameAttrMap(targetAttr.name) + targetOutput.map(targetAttr => { + val sourceAttr = resolvedSourceOutput.find(f => resolver(f.name, targetAttr.name)).get Assignment(targetAttr, sourceAttr) }) } else { // We fill assign all the source fields to the target fields by order. - targetOutputWithoutMetaFields - .zip(resolvedSourceOutputWithoutMetaFields) + targetOutput + .zip(resolvedSourceOutput) .map { case (targetAttr, sourceAttr) => Assignment(targetAttr, sourceAttr) } } } else { @@ -214,8 +213,9 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi }.toMap // Validate if there are incorrect target attributes. + val targetColumnNames = removeMetaFields(target.output).map(_.name) val unKnowTargets = target2Values.keys - .filterNot(removeMetaFields(target.output).map(_.name).contains(_)) + .filterNot(name => targetColumnNames.exists(resolver(_, name))) if (unKnowTargets.nonEmpty) { throw new AnalysisException(s"Cannot find target attributes: ${unKnowTargets.mkString(",")}.") } @@ -224,19 +224,20 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi // e.g. If the update action missing 'id' attribute, we fill a "id = target.id" to the update action. val newAssignments = removeMetaFields(target.output) .map(attr => { + val valueOption = target2Values.find(f => resolver(f._1, attr.name)) // TODO support partial update for MOR. - if (!target2Values.contains(attr.name) && targetTableType == MOR_TABLE_TYPE_OPT_VAL) { + if (valueOption.isEmpty && targetTableType == MOR_TABLE_TYPE_OPT_VAL) { throw new AnalysisException(s"Missing specify the value for target field: '${attr.name}' in merge into update action" + s" for MOR table. Currently we cannot support partial update for MOR," + s" please complete all the target fields just like '...update set id = s0.id, name = s0.name ....'") } if (preCombineField.isDefined && preCombineField.get.equalsIgnoreCase(attr.name) - && !target2Values.contains(attr.name)) { + && valueOption.isEmpty) { throw new AnalysisException(s"Missing specify value for the preCombineField:" + s" ${preCombineField.get} in merge-into update action. You should add" + s" '... update set ${preCombineField.get} = xx....' 
to the when-matched clause.") } - Assignment(attr, target2Values.getOrElse(attr.name, attr)) + Assignment(attr, if (valueOption.isEmpty) attr else valueOption.get._2) }) UpdateAction(resolvedCondition, newAssignments) case DeleteAction(condition) => @@ -405,6 +406,11 @@ case class HoodiePostAnalysisRule(sparkSession: SparkSession) extends Rule[Logic case CreateDataSourceTableCommand(table, ignoreIfExists) if isHoodieTable(table) => CreateHoodieTableCommand(table, ignoreIfExists) + // Rewrite the AlterTableDropPartitionCommand to AlterHoodieTableDropPartitionCommand + case AlterTableDropPartitionCommand(tableName, specs, _, _, _) + if isHoodieTable(tableName, sparkSession) => + AlterHoodieTableDropPartitionCommand(tableName, specs) + // Rewrite the AlterTableRenameCommand to AlterHoodieTableRenameCommand // Rewrite the AlterTableAddColumnsCommand to AlterHoodieTableAddColumnsCommand case AlterTableAddColumnsCommand(tableId, colsToAdd) if isHoodieTable(tableId, sparkSession) => @@ -417,6 +423,9 @@ case class HoodiePostAnalysisRule(sparkSession: SparkSession) extends Rule[Logic case AlterTableChangeColumnCommand(tableName, columnName, newColumn) if isHoodieTable(tableName, sparkSession) => AlterHoodieTableChangeColumnCommand(tableName, columnName, newColumn) + case ShowPartitionsCommand(tableName, specOpt) + if isHoodieTable(tableName, sparkSession) => + ShowHoodieTablePartitionsCommand(tableName, specOpt) // Rewrite TruncateTableCommand to TruncateHoodieTableCommand case TruncateTableCommand(tableName, partitionSpec) if isHoodieTable(tableName, sparkSession) => diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala new file mode 100644 index 0000000000000..7c4d45649587b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkSqlWriter, HoodieWriterUtils} +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.apache.hudi.common.util.PartitionPathEncodeUtils +import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME +import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.execution.command.{DDLUtils, RunnableCommand} +import org.apache.spark.sql.hudi.HoodieSqlUtils._ + +case class AlterHoodieTableDropPartitionCommand( + tableIdentifier: TableIdentifier, + specs: Seq[TablePartitionSpec]) +extends RunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val catalog = sparkSession.sessionState.catalog + val table = catalog.getTableMetadata(tableIdentifier) + DDLUtils.verifyAlterTableType(catalog, table, isView = false) + + val path = getTableLocation(table, sparkSession) + val hadoopConf = sparkSession.sessionState.newHadoopConf() + val metaClient = HoodieTableMetaClient.builder().setBasePath(path).setConf(hadoopConf).build() + val partitionColumns = metaClient.getTableConfig.getPartitionFields + val normalizedSpecs: Seq[Map[String, String]] = specs.map { spec => + normalizePartitionSpec( + spec, + partitionColumns.get(), + table.identifier.quotedString, + sparkSession.sessionState.conf.resolver) + } + + val parameters = buildHoodieConfig(sparkSession, path, partitionColumns.get, normalizedSpecs) + + HoodieSparkSqlWriter.write( + sparkSession.sqlContext, + SaveMode.Append, + parameters, + sparkSession.emptyDataFrame) + + Seq.empty[Row] + } + + private def buildHoodieConfig( + sparkSession: SparkSession, + path: String, + partitionColumns: Seq[String], + normalizedSpecs: Seq[Map[String, String]]): Map[String, String] = { + val table = sparkSession.sessionState.catalog.getTableMetadata(tableIdentifier) + val allPartitionPaths = getAllPartitionPaths(sparkSession, table) + val enableHiveStylePartitioning = isHiveStyledPartitioning(allPartitionPaths, table) + val enableEncodeUrl = isUrlEncodeEnabled(allPartitionPaths, table) + val partitionsToDelete = normalizedSpecs.map { spec => + partitionColumns.map{ partitionColumn => + val encodedPartitionValue = if (enableEncodeUrl) { + PartitionPathEncodeUtils.escapePathName(spec(partitionColumn)) + } else { + spec(partitionColumn) + } + if (enableHiveStylePartitioning) { + partitionColumn + "=" + encodedPartitionValue + } else { + encodedPartitionValue + } + }.mkString("/") + }.mkString(",") + + val metaClient = HoodieTableMetaClient.builder() + .setBasePath(path) + .setConf(sparkSession.sessionState.newHadoopConf) + .build() + val tableConfig = metaClient.getTableConfig + + val optParams = withSparkConf(sparkSession, table.storage.properties) { + Map( + "path" -> path, + TBL_NAME.key -> tableIdentifier.table, + TABLE_TYPE.key -> tableConfig.getTableType.name, + OPERATION.key -> DataSourceWriteOptions.DELETE_PARTITION_OPERATION_OPT_VAL, + PARTITIONS_TO_DELETE.key -> partitionsToDelete, + RECORDKEY_FIELD.key -> tableConfig.getRecordKeyFieldProp, + PRECOMBINE_FIELD.key -> tableConfig.getPreCombineField, + PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp + ) + } + + val parameters = 
HoodieWriterUtils.parametersWithWriteDefaults(optParams) + val translatedOptions = DataSourceWriteOptions.translateSqlOptions(parameters) + translatedOptions + } + + def normalizePartitionSpec[T]( + partitionSpec: Map[String, T], + partColNames: Seq[String], + tblName: String, + resolver: Resolver): Map[String, T] = { + val normalizedPartSpec = partitionSpec.toSeq.map { case (key, value) => + val normalizedKey = partColNames.find(resolver(_, key)).getOrElse { + throw new AnalysisException(s"$key is not a valid partition column in table $tblName.") + } + normalizedKey -> value + } + + if (normalizedPartSpec.size < partColNames.size) { + throw new AnalysisException( + "All partition columns need to be specified for Hoodie's dropping partition") + } + + val lowerPartColNames = partColNames.map(_.toLowerCase) + if (lowerPartColNames.distinct.length != lowerPartColNames.length) { + val duplicateColumns = lowerPartColNames.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => s"`$x`" + } + throw new AnalysisException( + s"Found duplicate column(s) in the partition schema: ${duplicateColumns.mkString(", ")}") + } + + normalizedPartSpec.toMap + } + +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala index 00e8afb6099bb..8ac63126a4b93 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala @@ -19,9 +19,6 @@ package org.apache.spark.sql.hudi.command import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.hudi.client.common.HoodieSparkEngineContext -import org.apache.hudi.common.config.HoodieMetadataConfig -import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.HoodieFileFormat import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ValidationUtils @@ -29,7 +26,6 @@ import org.apache.hudi.hadoop.HoodieParquetInputFormat import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport} -import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.internal.Logging import org.apache.spark.sql.avro.SchemaConverters import org.apache.spark.sql.catalyst.TableIdentifier @@ -45,8 +41,12 @@ import org.apache.spark.sql.internal.StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOL import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.{SPARK_VERSION, SparkConf} - import java.util.{Locale, Properties} + +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.keygen.ComplexKeyGenerator +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory + import scala.collection.JavaConverters._ import scala.collection.mutable @@ -94,44 +94,22 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean .setBasePath(path) .setConf(conf) .build() - val tableSchema = getTableSqlSchema(metaClient) - - // Get options from the external table and append with the options in ddl. 
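A self-contained sketch (not part of the patch) of how buildHoodieConfig above turns normalized partition specs into the comma-separated PARTITIONS_TO_DELETE value; java.net.URLEncoder stands in here for PartitionPathEncodeUtils.escapePathName:

import java.net.URLEncoder
import java.nio.charset.StandardCharsets

object PartitionPathSketch {
  // One path per spec, columns joined in table partition-column order.
  def toPartitionPaths(
      partitionColumns: Seq[String],
      specs: Seq[Map[String, String]],
      hiveStyle: Boolean,
      urlEncode: Boolean): String = {
    specs.map { spec =>
      partitionColumns.map { col =>
        val raw = spec(col)
        val value =
          if (urlEncode) URLEncoder.encode(raw, StandardCharsets.UTF_8.name()) else raw
        if (hiveStyle) s"$col=$value" else value
      }.mkString("/")
    }.mkString(",") // multiple specs are comma separated
  }

  def main(args: Array[String]): Unit = {
    val paths = toPartitionPaths(
      partitionColumns = Seq("dt", "hh"),
      specs = Seq(Map("dt" -> "2021-10-01", "hh" -> "12")),
      hiveStyle = true,
      urlEncode = false)
    println(paths) // dt=2021-10-01/hh=12
  }
}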
- val originTableConfig = HoodieOptionConfig.mappingTableConfigToSqlOption( - metaClient.getTableConfig.getProps.asScala.toMap) - - val allPartitionPaths = getAllPartitionPaths(sparkSession, table) - var upgrateConfig = Map.empty[String, String] - // If this is a non-hive-styled partition table, disable the hive style config. - // (By default this config is enable for spark sql) - upgrateConfig = if (isNotHiveStyledPartitionTable(allPartitionPaths, table)) { - upgrateConfig + (DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key -> "false") - } else { - upgrateConfig - } - upgrateConfig = if (isUrlEncodeDisable(allPartitionPaths, table)) { - upgrateConfig + (DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key -> "false") - } else { - upgrateConfig - } + val tableSchema = getTableSqlSchema(metaClient) - // Use the origin keygen to generate record key to keep the rowkey consistent with the old table for spark sql. - // See SqlKeyGenerator#getRecordKey for detail. - upgrateConfig = if (originTableConfig.contains(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key)) { - upgrateConfig + (SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> originTableConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key)) - } else { - upgrateConfig - } - val options = originTableConfig ++ upgrateConfig ++ table.storage.properties + // Get options from the external table and append with the options in ddl. + val originTableConfig = HoodieOptionConfig.mappingTableConfigToSqlOption( + metaClient.getTableConfig.getProps.asScala.toMap) + val extraConfig = extraTableConfig(sparkSession, isTableExists, originTableConfig) + val options = originTableConfig ++ table.storage.properties ++ extraConfig val userSpecifiedSchema = table.schema if (userSpecifiedSchema.isEmpty && tableSchema.isDefined) { (addMetaFields(tableSchema.get), options) } else if (userSpecifiedSchema.nonEmpty) { (addMetaFields(userSpecifiedSchema), options) - } else { + } else { throw new IllegalArgumentException(s"Missing schema for Create Table: $tableName") - } + } } else { assert(table.schema.nonEmpty, s"Missing schema for Create Table: $tableName") // SPARK-19724: the default location of a managed table should be non-existent or empty. @@ -141,7 +119,8 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean s". The associated location('$path') already exists.") } // Add the meta fields to the schema if this is a managed table or an empty external table. - (addMetaFields(table.schema), table.storage.properties) + val options = table.storage.properties ++ extraTableConfig(sparkSession, false) + (addMetaFields(table.schema), options) } val tableType = HoodieOptionConfig.getTableType(table.storage.properties) @@ -319,54 +298,42 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean } } - private def getAllPartitionPaths(spark: SparkSession, table: CatalogTable): Seq[String] = { - val sparkEngine = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)) - val metadataConfig = { - val properties = new Properties() - properties.putAll((spark.sessionState.conf.getAllConfs ++ table.storage.properties).asJava) - HoodieMetadataConfig.newBuilder.fromProperties(properties).build() - } - FSUtils.getAllPartitionPaths(sparkEngine, metadataConfig, getTableLocation(table, spark)).asScala - } - - /** - * This method is used to compatible with the old non-hive-styled partition table. 
- * By default we enable the "hoodie.datasource.write.hive_style_partitioning" - * when writing data to hudi table by spark sql by default. - * If the exist table is a non-hive-styled partitioned table, we should - * disable the "hoodie.datasource.write.hive_style_partitioning" when - * merge or update the table. Or else, we will get an incorrect merge result - * as the partition path mismatch. - */ - private def isNotHiveStyledPartitionTable(partitionPaths: Seq[String], table: CatalogTable): Boolean = { - if (table.partitionColumnNames.nonEmpty) { - val isHiveStylePartitionPath = (path: String) => { - val fragments = path.split("/") - if (fragments.size != table.partitionColumnNames.size) { - false - } else { - fragments.zip(table.partitionColumnNames).forall { - case (pathFragment, partitionColumn) => pathFragment.startsWith(s"$partitionColumn=") - } - } + def extraTableConfig(sparkSession: SparkSession, isTableExists: Boolean, + originTableConfig: Map[String, String] = Map.empty): Map[String, String] = { + val extraConfig = mutable.Map.empty[String, String] + if (isTableExists) { + val allPartitionPaths = getAllPartitionPaths(sparkSession, table) + if (originTableConfig.contains(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key)) { + extraConfig(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) = + originTableConfig(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) + } else { + extraConfig(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) = + String.valueOf(isHiveStyledPartitioning(allPartitionPaths, table)) + } + if (originTableConfig.contains(HoodieTableConfig.URL_ENCODE_PARTITIONING.key)) { + extraConfig(HoodieTableConfig.URL_ENCODE_PARTITIONING.key) = + originTableConfig(HoodieTableConfig.URL_ENCODE_PARTITIONING.key) + } else { + extraConfig(HoodieTableConfig.URL_ENCODE_PARTITIONING.key) = + String.valueOf(isUrlEncodeEnabled(allPartitionPaths, table)) } - !partitionPaths.forall(isHiveStylePartitionPath) } else { - false + extraConfig(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) = "true" + extraConfig(HoodieTableConfig.URL_ENCODE_PARTITIONING.key) = HoodieTableConfig.URL_ENCODE_PARTITIONING.defaultValue() } - } - /** - * If this table has disable the url encode, spark sql should also disable it when writing to the table. 
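The removed per-command helpers above detected partition style by inspecting stored partition paths; this change relies on shared helpers (isHiveStyledPartitioning / isUrlEncodeEnabled) for the same check. A dependency-free sketch (not part of the patch) of that detection, assuming hive-style means one "column=value" fragment per partition column:

object PartitionStyleSketch {
  def isHiveStyled(partitionPaths: Seq[String], partitionColumns: Seq[String]): Boolean =
    partitionColumns.nonEmpty && partitionPaths.forall { path =>
      val fragments = path.split("/")
      fragments.length == partitionColumns.length &&
        fragments.zip(partitionColumns).forall {
          case (fragment, col) => fragment.startsWith(s"$col=")
        }
    }

  def main(args: Array[String]): Unit = {
    val cols = Seq("dt")
    println(isHiveStyled(Seq("dt=2021-10-01", "dt=2021-10-02"), cols)) // true
    println(isHiveStyled(Seq("2021-10-01", "2021-10-02"), cols))       // false
  }
}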
- */ - private def isUrlEncodeDisable(partitionPaths: Seq[String], table: CatalogTable): Boolean = { - if (table.partitionColumnNames.nonEmpty) { - !partitionPaths.forall(partitionPath => partitionPath.split("/").length == table.partitionColumnNames.size) + val primaryColumns = HoodieOptionConfig.getPrimaryColumns(originTableConfig ++ table.storage.properties) + if (primaryColumns.isEmpty) { + extraConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) = classOf[UuidKeyGenerator].getCanonicalName + } else if (originTableConfig.contains(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key)) { + extraConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) = + HoodieSparkKeyGeneratorFactory.convertToSparkKeyGenerator( + originTableConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key)) } else { - false + extraConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) = classOf[ComplexKeyGenerator].getCanonicalName } + extraConfig.toMap } - } object CreateHoodieTableCommand extends Logging { @@ -395,6 +362,9 @@ object CreateHoodieTableCommand extends Logging { checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.PRECOMBINE_FIELD.key) checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.PARTITION_FIELDS.key) checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.RECORDKEY_FIELDS.key) + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.URL_ENCODE_PARTITIONING.key) + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) // Save all the table config to the hoodie.properties. val parameters = originTableConfig ++ tableOptions val properties = new Properties() diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala index 4d6d0a236b7ad..987ce0e050be4 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hudi.command import org.apache.hudi.DataSourceWriteOptions.{OPERATION, _} +import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import org.apache.hudi.hive.ddl.HiveSyncMode @@ -58,7 +59,12 @@ case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends Runnab val targetTable = sparkSession.sessionState.catalog .getTableMetadata(tableId) val path = getTableLocation(targetTable, sparkSession) - + val conf = sparkSession.sessionState.newHadoopConf() + val metaClient = HoodieTableMetaClient.builder() + .setBasePath(path) + .setConf(conf) + .build() + val tableConfig = metaClient.getTableConfig val primaryColumns = HoodieOptionConfig.getPrimaryColumns(targetTable.storage.properties) assert(primaryColumns.nonEmpty, @@ -66,13 +72,14 @@ case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends Runnab withSparkConf(sparkSession, targetTable.storage.properties) { Map( "path" -> path, - KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, TBL_NAME.key -> tableId.table, + HIVE_STYLE_PARTITIONING.key -> 
tableConfig.getHiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitoning, + KEYGENERATOR_CLASS_NAME.key -> tableConfig.getKeyGeneratorClassName, OPERATION.key -> DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL, PARTITIONPATH_FIELD.key -> targetTable.partitionColumnNames.mkString(","), HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", - HIVE_STYLE_PARTITIONING.key -> "true", HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> "200", SqlKeyGenerator.PARTITION_SCHEMA -> targetTable.partitionSchema.toDDL ) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala index e1c61ed9b03dc..2b88373115b71 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala @@ -21,12 +21,14 @@ import org.apache.avro.Schema import org.apache.avro.generic.{GenericRecord, IndexedRecord} import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieRecord} +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.{Option => HOption} import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import org.apache.hudi.exception.HoodieDuplicateKeyException import org.apache.hudi.hive.MultiPartKeysValueExtractor import org.apache.hudi.hive.ddl.HiveSyncMode +import org.apache.hudi.keygen.ComplexKeyGenerator import org.apache.hudi.sql.InsertMode import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkSqlWriter, HoodieWriterUtils} import org.apache.spark.internal.Logging @@ -90,7 +92,6 @@ object InsertIntoHoodieTableCommand extends Logging { // for insert into or insert overwrite partition we use append mode. 
SaveMode.Append } - val parameters = HoodieWriterUtils.parametersWithWriteDefaults(config) val conf = sparkSession.sessionState.conf val alignedQuery = alignOutputFields(query, table, insertPartitions, conf) // If we create dataframe using the Dataset.ofRows(sparkSession, alignedQuery), @@ -100,7 +101,7 @@ object InsertIntoHoodieTableCommand extends Logging { val inputDF = sparkSession.createDataFrame( Dataset.ofRows(sparkSession, alignedQuery).rdd, alignedQuery.schema) val success = - HoodieSparkSqlWriter.write(sparkSession.sqlContext, mode, parameters, inputDF)._1 + HoodieSparkSqlWriter.write(sparkSession.sqlContext, mode, config, inputDF)._1 if (success) { if (refreshTable) { sparkSession.catalog.refreshTable(table.identifier.unquotedString) @@ -197,19 +198,42 @@ object InsertIntoHoodieTableCommand extends Logging { val parameters = withSparkConf(sparkSession, options)() val tableType = parameters.getOrElse(TABLE_TYPE.key, TABLE_TYPE.defaultValue) - - val partitionFields = table.partitionColumnNames.mkString(",") - val path = getTableLocation(table, sparkSession) - - val tableSchema = table.schema - val primaryColumns = HoodieOptionConfig.getPrimaryColumns(options) + val partitionFields = table.partitionColumnNames.mkString(",") - val keyGenClass = if (primaryColumns.nonEmpty) { - classOf[SqlKeyGenerator].getCanonicalName + val path = getTableLocation(table, sparkSession) + val conf = sparkSession.sessionState.newHadoopConf() + val isTableExists = tableExistsInPath(path, conf) + val tableConfig = if (isTableExists) { + HoodieTableMetaClient.builder() + .setBasePath(path) + .setConf(conf) + .build() + .getTableConfig } else { - classOf[UuidKeyGenerator].getName + null } + val hiveStylePartitioningEnable = if (null == tableConfig || null == tableConfig.getHiveStylePartitioningEnable) { + "true" + } else { + tableConfig.getHiveStylePartitioningEnable + } + val urlEncodePartitioning = if (null == tableConfig || null == tableConfig.getUrlEncodePartitoning) { + "false" + } else { + tableConfig.getUrlEncodePartitoning + } + val keyGeneratorClassName = if (null == tableConfig || null == tableConfig.getKeyGeneratorClassName) { + if (primaryColumns.nonEmpty) { + classOf[ComplexKeyGenerator].getCanonicalName + } else { + classOf[UuidKeyGenerator].getCanonicalName + } + } else { + tableConfig.getKeyGeneratorClassName + } + + val tableSchema = table.schema val dropDuplicate = sparkSession.conf .getOption(INSERT_DROP_DUPS.key) @@ -267,7 +291,9 @@ object InsertIntoHoodieTableCommand extends Logging { TBL_NAME.key -> table.identifier.table, PRECOMBINE_FIELD.key -> tableSchema.fields.last.name, OPERATION.key -> operation, - KEYGENERATOR_CLASS_NAME.key -> keyGenClass, + HIVE_STYLE_PARTITIONING.key -> hiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> urlEncodePartitioning, + KEYGENERATOR_CLASS_NAME.key -> keyGeneratorClassName, RECORDKEY_FIELD.key -> primaryColumns.mkString(","), PARTITIONPATH_FIELD.key -> partitionFields, PAYLOAD_CLASS_NAME.key -> payloadClassName, @@ -279,10 +305,8 @@ object InsertIntoHoodieTableCommand extends Logging { HIVE_DATABASE.key -> table.identifier.database.getOrElse("default"), HIVE_TABLE.key -> table.identifier.table, HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", - HIVE_STYLE_PARTITIONING.key -> "true", HIVE_PARTITION_FIELDS.key -> partitionFields, HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, - URL_ENCODE_PARTITIONING.key -> "true", HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "200", 
HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", SqlKeyGenerator.PARTITION_SCHEMA -> table.partitionSchema.toDDL diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala index c4b9aec753e7a..5ec15ce4d84fd 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hudi.command import org.apache.avro.Schema import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import org.apache.hudi.hive.MultiPartKeysValueExtractor @@ -26,6 +27,7 @@ import org.apache.hudi.hive.ddl.HiveSyncMode import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, HoodieSparkSqlWriter, HoodieWriterUtils, SparkAdapterSupport} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BoundReference, Cast, EqualTo, Expression, Literal} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.command.RunnableCommand @@ -34,7 +36,6 @@ import org.apache.spark.sql.hudi.command.payload.ExpressionPayload import org.apache.spark.sql.hudi.command.payload.ExpressionPayload._ import org.apache.spark.sql.hudi.{HoodieOptionConfig, SerDeUtils} import org.apache.spark.sql.types.{BooleanType, StructType} - import java.util.Base64 /** @@ -90,6 +91,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab * TODO Currently Non-equivalent conditions are not supported. */ private lazy val targetKey2SourceExpression: Map[String, Expression] = { + val resolver = sparkSession.sessionState.conf.resolver val conditions = splitByAnd(mergeInto.mergeCondition) val allEqs = conditions.forall(p => p.isInstanceOf[EqualTo]) if (!allEqs) { @@ -101,11 +103,11 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab val target2Source = conditions.map(_.asInstanceOf[EqualTo]) .map { case EqualTo(left: AttributeReference, right) - if targetAttrs.indexOf(left) >= 0 => // left is the target field - left.name -> right + if targetAttrs.exists(f => attributeEqual(f, left, resolver)) => // left is the target field + targetAttrs.find(f => resolver(f.name, left.name)).get.name -> right case EqualTo(left, right: AttributeReference) - if targetAttrs.indexOf(right) >= 0 => // right is the target field - right.name -> left + if targetAttrs.exists(f => attributeEqual(f, right, resolver)) => // right is the target field + targetAttrs.find(f => resolver(f.name, right.name)).get.name -> left case eq => throw new AnalysisException(s"Invalidate Merge-On condition: ${eq.sql}." + "The validate condition should be 'targetColumn = sourceColumnExpression', e.g." 
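The targetKey2SourceExpression change above swaps exact attribute matching for resolver-based matching, so merge conditions resolve against target columns the same way the session does (case-insensitively by default). A simplified sketch (not part of the patch) of that lookup, with the resolver modeled as a plain function:

object ResolverMatchSketch {
  type Resolver = (String, String) => Boolean

  // Default Spark behaviour unless spark.sql.caseSensitive is enabled.
  val caseInsensitiveResolution: Resolver = (a, b) => a.equalsIgnoreCase(b)

  // Returns the target column, in its declared spelling, that the merge
  // condition refers to, or None if the condition names an unknown column.
  def resolveTargetColumn(
      targetColumns: Seq[String],
      conditionColumn: String,
      resolver: Resolver): Option[String] =
    targetColumns.find(resolver(_, conditionColumn))

  def main(args: Array[String]): Unit = {
    val cols = Seq("id", "name", "price")
    println(resolveTargetColumn(cols, "ID", caseInsensitiveResolution)) // Some(id)
    println(resolveTargetColumn(cols, "dt", caseInsensitiveResolution)) // None
  }
}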
+ @@ -196,13 +198,24 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab } private def isEqualToTarget(targetColumnName: String, sourceExpression: Expression): Boolean = { + val sourceColumnName = sourceDFOutput.map(_.name) + val resolver = sparkSession.sessionState.conf.resolver + sourceExpression match { - case attr: AttributeReference if attr.name.equalsIgnoreCase(targetColumnName) => true - case Cast(attr: AttributeReference, _, _) if attr.name.equalsIgnoreCase(targetColumnName) => true + case attr: AttributeReference if sourceColumnName.find(resolver(_, attr.name)).get.equals(targetColumnName) => true + case Cast(attr: AttributeReference, _, _) if sourceColumnName.find(resolver(_, attr.name)).get.equals(targetColumnName) => true case _=> false } } + /** + * Compare a [[Attribute]] to another, return true if they have the same column name(by resolver) and exprId + */ + private def attributeEqual( + attr: Attribute, other: Attribute, resolver: Resolver): Boolean = { + resolver(attr.name, other.name) && attr.exprId == other.exprId + } + /** * Execute the update and delete action. All the matched and not-matched actions will * execute in one upsert write operation. We pushed down the matched condition and assignment @@ -359,9 +372,9 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab mergeInto.targetTable.output .filterNot(attr => isMetaField(attr.name)) .map(attr => { - val assignment = attr2Assignment.getOrElse(attr, - throw new IllegalArgumentException(s"Cannot find related assignment for field: ${attr.name}")) - castIfNeeded(assignment, attr.dataType, sparkSession.sqlContext.conf) + val assignment = attr2Assignment.find(f => attributeEqual(f._1, attr, sparkSession.sessionState.conf.resolver)) + .getOrElse(throw new IllegalArgumentException(s"Cannot find related assignment for field: ${attr.name}")) + castIfNeeded(assignment._2, attr.dataType, sparkSession.sqlContext.conf) }) } @@ -417,7 +430,12 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab val targetTableDb = targetTableIdentify.database.getOrElse("default") val targetTableName = targetTableIdentify.identifier val path = getTableLocation(targetTable, sparkSession) - + val conf = sparkSession.sessionState.newHadoopConf() + val metaClient = HoodieTableMetaClient.builder() + .setBasePath(path) + .setConf(conf) + .build() + val tableConfig = metaClient.getTableConfig val options = targetTable.storage.properties val definedPk = HoodieOptionConfig.getPrimaryColumns(options) // TODO Currently the mergeEqualConditionKeys must be the same the primary key. @@ -427,31 +445,30 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab } // Enable the hive sync by default if spark have enable the hive metastore. 
val enableHive = isEnableHive(sparkSession) - HoodieWriterUtils.parametersWithWriteDefaults( - withSparkConf(sparkSession, options) { - Map( - "path" -> path, - RECORDKEY_FIELD.key -> targetKey2SourceExpression.keySet.mkString(","), - KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, - PRECOMBINE_FIELD.key -> targetKey2SourceExpression.keySet.head, // set a default preCombine field - TBL_NAME.key -> targetTableName, - PARTITIONPATH_FIELD.key -> targetTable.partitionColumnNames.mkString(","), - PAYLOAD_CLASS_NAME.key -> classOf[ExpressionPayload].getCanonicalName, - META_SYNC_ENABLED.key -> enableHive.toString, - HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), - HIVE_USE_JDBC.key -> "false", - HIVE_DATABASE.key -> targetTableDb, - HIVE_TABLE.key -> targetTableName, - HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", - HIVE_STYLE_PARTITIONING.key -> "true", - HIVE_PARTITION_FIELDS.key -> targetTable.partitionColumnNames.mkString(","), - HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, - URL_ENCODE_PARTITIONING.key -> "true", // enable the url decode for sql. - HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "200", // set the default parallelism to 200 for sql - HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", - HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> "200", - SqlKeyGenerator.PARTITION_SCHEMA -> targetTable.partitionSchema.toDDL - ) - }) + withSparkConf(sparkSession, options) { + Map( + "path" -> path, + RECORDKEY_FIELD.key -> targetKey2SourceExpression.keySet.mkString(","), + PRECOMBINE_FIELD.key -> targetKey2SourceExpression.keySet.head, // set a default preCombine field + TBL_NAME.key -> targetTableName, + PARTITIONPATH_FIELD.key -> targetTable.partitionColumnNames.mkString(","), + PAYLOAD_CLASS_NAME.key -> classOf[ExpressionPayload].getCanonicalName, + HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitoning, + KEYGENERATOR_CLASS_NAME.key -> tableConfig.getKeyGeneratorClassName, + META_SYNC_ENABLED.key -> enableHive.toString, + HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), + HIVE_USE_JDBC.key -> "false", + HIVE_DATABASE.key -> targetTableDb, + HIVE_TABLE.key -> targetTableName, + HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", + HIVE_PARTITION_FIELDS.key -> targetTable.partitionColumnNames.mkString(","), + HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, + HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "200", // set the default parallelism to 200 for sql + HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", + HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> "200", + SqlKeyGenerator.PARTITION_SCHEMA -> targetTable.partitionSchema.toDDL + ) + } } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/ShowHoodieTablePartitionsCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/ShowHoodieTablePartitionsCommand.scala new file mode 100644 index 0000000000000..1c1f4b73d0da7 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/ShowHoodieTablePartitionsCommand.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.util.PartitionPathEncodeUtils +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.execution.command.RunnableCommand +import org.apache.spark.sql.execution.datasources.PartitioningUtils +import org.apache.spark.sql.hudi.HoodieSqlUtils._ +import org.apache.spark.sql.types.StringType + +/** + * Command for show hudi table's partitions. + */ +case class ShowHoodieTablePartitionsCommand( + tableName: TableIdentifier, + specOpt: Option[TablePartitionSpec]) +extends RunnableCommand { + + override val output: Seq[Attribute] = { + AttributeReference("partition", StringType, nullable = false)() :: Nil + } + + override def run(sparkSession: SparkSession): Seq[Row] = { + val catalog = sparkSession.sessionState.catalog + val resolver = sparkSession.sessionState.conf.resolver + val catalogTable = catalog.getTableMetadata(tableName) + val tablePath = getTableLocation(catalogTable, sparkSession) + + val hadoopConf = sparkSession.sessionState.newHadoopConf() + val metaClient = HoodieTableMetaClient.builder().setBasePath(tablePath) + .setConf(hadoopConf).build() + val schemaOpt = getTableSqlSchema(metaClient) + val partitionColumnNamesOpt = metaClient.getTableConfig.getPartitionFields + if (partitionColumnNamesOpt.isPresent && partitionColumnNamesOpt.get.nonEmpty + && schemaOpt.isDefined && schemaOpt.nonEmpty) { + + val partitionColumnNames = partitionColumnNamesOpt.get + val schema = schemaOpt.get + val allPartitionPaths: Seq[String] = getAllPartitionPaths(sparkSession, catalogTable) + + if (specOpt.isEmpty) { + allPartitionPaths.map(Row(_)) + } else { + val spec = specOpt.get + allPartitionPaths.filter { partitionPath => + val part = PartitioningUtils.parsePathFragment(partitionPath) + spec.forall { case (col, value) => + PartitionPathEncodeUtils.escapePartitionValue(value) == part.getOrElse(col, null) + } + }.map(Row(_)) + } + } else { + Seq.empty[Row] + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala index b59984acfb988..e069df97aff55 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala @@ -18,11 +18,13 @@ package org.apache.spark.sql.hudi.command import java.util.concurrent.TimeUnit.{MICROSECONDS, MILLISECONDS} + import org.apache.avro.generic.GenericRecord import 
org.apache.hudi.common.config.TypedProperties import org.apache.hudi.common.util.PartitionPathEncodeUtils import org.apache.hudi.config.HoodieWriteConfig -import org.apache.hudi.keygen.{BaseKeyGenerator, ComplexKeyGenerator, KeyGenUtils, SparkKeyGeneratorInterface} +import org.apache.hudi.keygen._ +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory import org.apache.spark.sql.Row import org.apache.spark.sql.types.{StructType, TimestampType} import org.joda.time.format.{DateTimeFormat, DateTimeFormatter} @@ -48,7 +50,8 @@ class SqlKeyGenerator(props: TypedProperties) extends ComplexKeyGenerator(props) val keyGenProps = new TypedProperties() keyGenProps.putAll(props) keyGenProps.remove(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME) - keyGenProps.put(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key, beforeKeyGenClassName) + val convertedKeyGenClassName = SqlKeyGenerator.getRealKeyGenClassName(props) + keyGenProps.put(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key, convertedKeyGenClassName) Some(KeyGenUtils.createKeyGeneratorByClassName(keyGenProps)) } else { None @@ -64,7 +67,7 @@ class SqlKeyGenerator(props: TypedProperties) extends ComplexKeyGenerator(props) } override def getRecordKey(row: Row): String = { - if (originKeyGen.isDefined && originKeyGen.get.isInstanceOf[SparkKeyGeneratorInterface]) { + if (originKeyGen.isDefined) { originKeyGen.get.asInstanceOf[SparkKeyGeneratorInterface].getRecordKey(row) } else { super.getRecordKey(row) @@ -121,4 +124,13 @@ object SqlKeyGenerator { val ORIGIN_KEYGEN_CLASS_NAME = "hoodie.sql.origin.keygen.class" private val timestampTimeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss") private val sqlTimestampFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.S") + + def getRealKeyGenClassName(props: TypedProperties): String = { + val beforeKeyGenClassName = props.getString(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME, null) + if (beforeKeyGenClassName != null) { + HoodieSparkKeyGeneratorFactory.convertToSparkKeyGenerator(beforeKeyGenClassName) + } else { + classOf[ComplexKeyGenerator].getCanonicalName + } + } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala index 20a827400ee4c..b1c8a04429e27 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hudi.command import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import org.apache.hudi.hive.MultiPartKeysValueExtractor @@ -85,7 +86,12 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends RunnableCo val targetTable = sparkSession.sessionState.catalog .getTableMetadata(tableId) val path = getTableLocation(targetTable, sparkSession) - + val conf = sparkSession.sessionState.newHadoopConf() + val metaClient = HoodieTableMetaClient.builder() + .setBasePath(path) + .setConf(conf) + .build() + val tableConfig = metaClient.getTableConfig val primaryColumns = HoodieOptionConfig.getPrimaryColumns(targetTable.storage.properties) assert(primaryColumns.nonEmpty, 
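Several commands in this change (SqlKeyGenerator.getRealKeyGenClassName above, and the InsertInto/Update/Delete/Merge commands) now take the key generator from the table config when one was recorded, instead of hard-coding it. A dependency-free sketch (not part of the patch) of that fallback order; class names are plain string labels rather than the fully qualified Hudi classes:

object KeyGenFallbackSketch {
  // Prefer the key generator recorded in hoodie.properties; otherwise pick by
  // whether the table declares primary key columns.
  def chooseKeyGenerator(
      configuredKeyGen: Option[String],
      primaryColumns: Seq[String]): String =
    configuredKeyGen.getOrElse {
      if (primaryColumns.nonEmpty) "ComplexKeyGenerator" else "UuidKeyGenerator"
    }

  def main(args: Array[String]): Unit = {
    println(chooseKeyGenerator(None, Seq("id")))                    // ComplexKeyGenerator
    println(chooseKeyGenerator(None, Nil))                          // UuidKeyGenerator
    println(chooseKeyGenerator(Some("SimpleKeyGenerator"), Seq("id"))) // SimpleKeyGenerator
  }
}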
@@ -95,9 +101,11 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends RunnableCo Map( "path" -> path, RECORDKEY_FIELD.key -> primaryColumns.mkString(","), - KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, PRECOMBINE_FIELD.key -> primaryColumns.head, //set the default preCombine field. TBL_NAME.key -> tableId.table, + HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitoning, + KEYGENERATOR_CLASS_NAME.key -> tableConfig.getKeyGeneratorClassName, OPERATION.key -> DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, PARTITIONPATH_FIELD.key -> targetTable.partitionColumnNames.mkString(","), META_SYNC_ENABLED.key -> enableHive.toString, @@ -107,9 +115,7 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends RunnableCo HIVE_TABLE.key -> tableId.table, HIVE_PARTITION_FIELDS.key -> targetTable.partitionColumnNames.mkString(","), HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, - URL_ENCODE_PARTITIONING.key -> "true", HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", - HIVE_STYLE_PARTITIONING.key -> "true", HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", SqlKeyGenerator.PARTITION_SCHEMA -> targetTable.partitionSchema.toDDL ) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala index ea55127d4416a..b025cf3efa443 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala @@ -31,7 +31,7 @@ import org.apache.hudi.common.util.{ValidationUtils, Option => HOption} import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.io.HoodieWriteHandle import org.apache.hudi.sql.IExpressionEvaluator -import org.apache.spark.sql.avro.{AvroSerializer, SchemaConverters} +import org.apache.spark.sql.avro.{AvroSerializer, HoodieAvroSerializer, SchemaConverters} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.hudi.SerDeUtils import org.apache.spark.sql.hudi.command.payload.ExpressionPayload.getEvaluator @@ -310,7 +310,7 @@ object ExpressionPayload { val conditionEvaluator = ExpressionCodeGen.doCodeGen(Seq(condition), conditionSerializer) val assignSqlType = SchemaConverters.toSqlType(writeSchema).dataType.asInstanceOf[StructType] - val assignSerializer = new AvroSerializer(assignSqlType, writeSchema, false) + val assignSerializer = new HoodieAvroSerializer(assignSqlType, writeSchema, false) val assignmentEvaluator = ExpressionCodeGen.doCodeGen(assignments, assignSerializer) conditionEvaluator -> assignmentEvaluator } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala index 2a12e9227a15a..3fb48f430221a 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.hudi.command.payload import 
org.apache.avro.generic.IndexedRecord import org.apache.avro.Schema -import org.apache.spark.sql.avro.{AvroDeserializer, SchemaConverters} +import org.apache.spark.sql.avro.{HooodieAvroDeserializer, SchemaConverters} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ @@ -29,8 +29,8 @@ import org.apache.spark.sql.types._ class SqlTypedRecord(val record: IndexedRecord) extends IndexedRecord { private lazy val sqlType = SchemaConverters.toSqlType(getSchema).dataType.asInstanceOf[StructType] - private lazy val avroDeserializer = new AvroDeserializer(record.getSchema, sqlType) - private lazy val sqlRow = avroDeserializer.deserialize(record).asInstanceOf[InternalRow] + private lazy val avroDeserializer = HooodieAvroDeserializer(record.getSchema, sqlType) + private lazy val sqlRow = avroDeserializer.deserializeData(record).asInstanceOf[InternalRow] override def put(i: Int, v: Any): Unit = { record.put(i, v) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala index 0482e74884926..a60a63b7a7d7b 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala @@ -179,10 +179,10 @@ class HoodieStreamSource( startOffset match { case INIT_OFFSET => startOffset.commitTime case HoodieSourceOffset(commitTime) => - val time = HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime + val time = HoodieActiveTimeline.parseInstantTime(commitTime).getTime // As we consume the data between (start, end], start is not included, // so we +1s to the start commit time here. 
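The HoodieStreamSource comment above explains that the source consumes the half-open range (start, end], so the start commit time is re-formatted one second later before being used. A small sketch (not part of the patch) of that adjustment, assuming the 14-digit "yyyyMMddHHmmss" instant format:

import java.text.SimpleDateFormat
import java.util.Date

object ExclusiveStartOffsetSketch {
  // Assumed instant format; the patch delegates parsing/formatting to
  // HoodieActiveTimeline.parseInstantTime / formatInstantTime.
  private val instantFormat = new SimpleDateFormat("yyyyMMddHHmmss")

  def advanceOneSecond(commitTime: String): String = {
    val time = instantFormat.parse(commitTime).getTime
    instantFormat.format(new Date(time + 1000))
  }

  def main(args: Array[String]): Unit = {
    println(advanceOneSecond("20211001120000")) // 20211001120001
  }
}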
- HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date(time + 1000)) + HoodieActiveTimeline.formatInstantTime(new Date(time + 1000)) case _=> throw new IllegalStateException("UnKnow offset type.") } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieCommonSqlParser.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieCommonSqlParser.scala index 4f410c7d67fb8..f830c515be782 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieCommonSqlParser.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieCommonSqlParser.scala @@ -62,7 +62,7 @@ class HoodieCommonSqlParser(session: SparkSession, delegate: ParserInterface) } def parseMultipartIdentifier(sqlText: String): Seq[String] = { - throw new UnsupportedOperationException(s"Unsupported parseMultipartIdentifier method") + sparkAdapter.parseMultipartIdentifier(delegate, sqlText) } protected def parse[T](command: String)(toResult: HoodieSqlCommonParser => T): T = { diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 081a8e4e61a48..6353aa2165123 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; @@ -27,6 +28,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; +import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.avro.Conversions; @@ -41,6 +43,8 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import org.mockito.ArgumentCaptor; import org.mockito.Captor; import org.mockito.Mock; @@ -49,13 +53,18 @@ import java.math.BigDecimal; import java.time.LocalDate; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; +import static org.apache.hudi.hive.ddl.HiveSyncMode.HMS; import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.instanceOf; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.times; @@ -65,6 +74,9 @@ @ExtendWith(MockitoExtension.class) public class TestDataSourceUtils { + private static final String HIVE_DATABASE = "testdb1"; 
+ private static final String HIVE_TABLE = "hive_trips"; + @Mock private SparkRDDWriteClient hoodieWriteClient; @@ -199,6 +211,29 @@ public void testCreateRDDCustomColumnsSortPartitionerWithValidPartitioner() thro assertThat(partitioner.isPresent(), is(true)); } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testBuildHiveSyncConfig(boolean useSyncMode) { + TypedProperties props = new TypedProperties(); + if (useSyncMode) { + props.setProperty(DataSourceWriteOptions.HIVE_SYNC_MODE().key(), HMS.name()); + props.setProperty(DataSourceWriteOptions.HIVE_USE_JDBC().key(), String.valueOf(false)); + } + props.setProperty(DataSourceWriteOptions.HIVE_DATABASE().key(), HIVE_DATABASE); + props.setProperty(DataSourceWriteOptions.HIVE_TABLE().key(), HIVE_TABLE); + HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, config.getBasePath(), PARQUET.name()); + + if (useSyncMode) { + assertFalse(hiveSyncConfig.useJdbc); + assertEquals(HMS.name(), hiveSyncConfig.syncMode); + } else { + assertTrue(hiveSyncConfig.useJdbc); + assertNull(hiveSyncConfig.syncMode); + } + assertEquals(HIVE_DATABASE, hiveSyncConfig.databaseName); + assertEquals(HIVE_TABLE, hiveSyncConfig.tableName); + } + private void setAndVerifyHoodieWriteClientWith(final String partitionerClassName) { config = HoodieWriteConfig.newBuilder().withPath(config.getBasePath()) .withUserDefinedBulkInsertPartitionerClass(partitionerClassName) diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java index 5cbcf6cf87279..2e89baa70b8bf 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java @@ -35,6 +35,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -195,9 +196,9 @@ private enum EffectiveMode { private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, EffectiveMode mode) throws Exception { if (deltaCommit) { - metaClient = HoodieTestUtils.init(basePath, HoodieTableType.MERGE_ON_READ, bootstrapBasePath); + metaClient = HoodieTestUtils.init(basePath, HoodieTableType.MERGE_ON_READ, bootstrapBasePath, true); } else { - metaClient = HoodieTestUtils.init(basePath, HoodieTableType.COPY_ON_WRITE, bootstrapBasePath); + metaClient = HoodieTestUtils.init(basePath, HoodieTableType.COPY_ON_WRITE, bootstrapBasePath, true); } int totalRecords = 100; @@ -251,6 +252,7 @@ private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, Effec .withFullBootstrapInputProvider(TestFullBootstrapDataProvider.class.getName()) .withBootstrapParallelism(3) .withBootstrapModeSelector(bootstrapModeSelectorClass).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .build(); SparkRDDWriteClient client = new SparkRDDWriteClient(context, config); client.bootstrap(Option.empty()); @@ -258,8 +260,9 @@ private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, Effec numInstantsAfterBootstrap, timestamp, timestamp, 
deltaCommit, bootstrapInstants); // Rollback Bootstrap - FSUtils.deleteInstantFile(metaClient.getFs(), metaClient.getMetaPath(), new HoodieInstant(State.COMPLETED, + HoodieActiveTimeline.deleteInstantFile(metaClient.getFs(), metaClient.getMetaPath(), new HoodieInstant(State.COMPLETED, deltaCommit ? HoodieTimeline.DELTA_COMMIT_ACTION : HoodieTimeline.COMMIT_ACTION, bootstrapCommitInstantTs)); + metaClient.reloadActiveTimeline(); client.rollbackFailedBootstrap(); metaClient.reloadActiveTimeline(); assertEquals(0, metaClient.getCommitsTimeline().countInstants()); @@ -374,8 +377,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta reloadInputFormats(); List records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( jsc.hadoopConfiguration(), - FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(), false).stream() + FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>()); assertEquals(totalRecords, records.size()); @@ -393,8 +395,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta seenKeys = new HashSet<>(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( jsc.hadoopConfiguration(), - FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(), false).stream() + FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>()); assertEquals(totalRecords, records.size()); @@ -410,8 +411,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta reloadInputFormats(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( jsc.hadoopConfiguration(), - FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(), false).stream() + FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, true, HoodieRecord.HOODIE_META_COLUMNS); @@ -428,8 +428,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta seenKeys = new HashSet<>(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( jsc.hadoopConfiguration(), - FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(), false).stream() + FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, true, HoodieRecord.HOODIE_META_COLUMNS); @@ -444,8 +443,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta reloadInputFormats(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( 
jsc.hadoopConfiguration(), - FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(), false).stream() + FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, true, Arrays.asList("_row_key")); @@ -462,8 +460,7 @@ private void checkBootstrapResults(int totalRecords, Schema schema, String insta seenKeys = new HashSet<>(); records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat( jsc.hadoopConfiguration(), - FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(), false).stream() + FSUtils.getAllPartitionPaths(context, basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false).stream() .map(f -> basePath + "/" + f).collect(Collectors.toList()), basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, true, Arrays.asList("_row_key")); @@ -550,6 +547,7 @@ public Map> select(List { + if (i % 2 == 0) { + testBulkInsertHelperFor("_row_key"); + } else { + testBulkInsertHelperFor("ts"); + } + }); + } + @Test public void testBulkInsertHelper() { - HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(getPropsAllSet()).combineInput(false, false).build(); + testBulkInsertHelperFor("_row_key"); + } + + private void testBulkInsertHelperFor(String recordKey) { + HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(getPropsAllSet(recordKey)).combineInput(false, false).build(); List rows = DataSourceTestUtils.generateRandomRows(10); Dataset dataset = sqlContext.createDataFrame(rows, structType); Dataset result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName", @@ -106,7 +122,7 @@ public void testBulkInsertHelper() { } result.toJavaRDD().foreach(entry -> { - assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD)).equals(entry.getAs("_row_key"))); + assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD)).equals(entry.getAs(recordKey).toString())); assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).equals(entry.getAs("partition"))); assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD)).equals("")); assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD)).equals("")); @@ -148,7 +164,8 @@ public void testBulkInsertHelperNoMetaFields() { @ParameterizedTest @MethodSource("providePreCombineArgs") public void testBulkInsertPreCombine(boolean enablePreCombine) { - HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(getPropsAllSet()).combineInput(enablePreCombine, enablePreCombine) + HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(getPropsAllSet("_row_key")) + .combineInput(enablePreCombine, enablePreCombine) .withPreCombineField("ts").build(); List inserts = DataSourceTestUtils.generateRandomRows(10); Dataset toUpdateDataset = sqlContext.createDataFrame(inserts.subList(0, 5), structType); @@ -207,22 +224,27 @@ public void testBulkInsertPreCombine(boolean enablePreCombine) { } } - private Map getPropsAllSet() { - return getProps(true, true, true, true); + private Map getPropsAllSet(String recordKey) { + return getProps(recordKey, 
true, true, true, true); } private Map getProps(boolean setAll, boolean setKeyGen, boolean setRecordKey, boolean setPartitionPath) { + return getProps("_row_key", setAll, setKeyGen, setRecordKey, setPartitionPath); + } + + private Map getProps(String recordKey, boolean setAll, boolean setKeyGen, boolean setRecordKey, boolean setPartitionPath) { Map props = new HashMap<>(); if (setAll) { props.put(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), "org.apache.hudi.keygen.SimpleKeyGenerator"); - props.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key"); + props.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey); props.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition"); + props.put(HoodieWriteConfig.TBL_NAME.key(), recordKey + "_table"); } else { if (setKeyGen) { props.put(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), "org.apache.hudi.keygen.SimpleKeyGenerator"); } if (setRecordKey) { - props.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key"); + props.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey); } if (setPartitionPath) { props.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition"); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java new file mode 100644 index 0000000000000..fba09091add50 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java @@ -0,0 +1,503 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.functional; + +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.bootstrap.BootstrapMode; +import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider; +import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector; +import org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector; +import org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.common.bootstrap.index.BootstrapIndex; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.AvroOrcUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.OrcReaderIterator; +import org.apache.hudi.common.util.PartitionPathEncodeUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieBootstrapConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; +import org.apache.hudi.index.HoodieIndex.IndexType; +import org.apache.hudi.keygen.NonpartitionedKeyGenerator; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.table.action.bootstrap.BootstrapUtils; +import org.apache.hudi.testutils.HoodieClientTestBase; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; +import org.apache.parquet.avro.AvroReadSupport; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.api.java.UDF1; +import org.apache.spark.sql.types.DataTypes; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Spliterators; +import 
java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.StreamSupport; + +import static java.util.stream.Collectors.mapping; +import static java.util.stream.Collectors.toList; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.generateGenericRecord; +import static org.apache.spark.sql.functions.callUDF; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests Bootstrap Client functionality. + */ +@Tag("functional") +public class TestOrcBootstrap extends HoodieClientTestBase { + + + public static final String TRIP_HIVE_COLUMN_TYPES = "bigint,string,string,string,double,double,double,double," + + "struct,array>,boolean"; + @TempDir + public java.nio.file.Path tmpFolder; + + protected String bootstrapBasePath = null; + + private HoodieParquetInputFormat roInputFormat; + private JobConf roJobConf; + + private HoodieParquetRealtimeInputFormat rtInputFormat; + private JobConf rtJobConf; + private SparkSession spark; + + @BeforeEach + public void setUp() throws Exception { + bootstrapBasePath = tmpFolder.toAbsolutePath().toString() + "/data"; + initPath(); + initSparkContexts(); + initTestDataGenerator(); + initMetaClient(); + reloadInputFormats(); + } + + @AfterEach + public void tearDown() throws IOException { + cleanupSparkContexts(); + cleanupClients(); + cleanupTestDataGenerator(); + } + + private void reloadInputFormats() { + // initialize parquet input format + roInputFormat = new HoodieParquetInputFormat(); + roJobConf = new JobConf(jsc.hadoopConfiguration()); + roInputFormat.setConf(roJobConf); + + } + + public Schema generateNewDataSetAndReturnSchema(long timestamp, int numRecords, List partitionPaths, + String srcPath) throws Exception { + boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty(); + Dataset df = generateTestRawTripDataset(timestamp, 0, numRecords, partitionPaths, jsc, sqlContext); + df.printSchema(); + if (isPartitioned) { + df.write().partitionBy("datestr").format("orc").mode(SaveMode.Overwrite).save(srcPath); + } else { + df.write().format("orc").mode(SaveMode.Overwrite).save(srcPath); + } + String filePath = FileStatusUtils.toPath(BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), + srcPath, context).stream().findAny().map(p -> p.getValue().stream().findAny()) + .orElse(null).get().getPath()).toString(); + Reader orcReader = OrcFile.createReader(new Path(filePath), OrcFile.readerOptions(metaClient.getHadoopConf())); + + TypeDescription orcSchema = orcReader.getSchema(); + + return AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "test_orc_record", null, true); + } + + @Test + public void testMetadataBootstrapUnpartitionedCOW() throws Exception { + testBootstrapCommon(false, false, EffectiveMode.METADATA_BOOTSTRAP_MODE); + } + + @Test + public void testMetadataBootstrapWithUpdatesCOW() throws Exception { + testBootstrapCommon(true, false, EffectiveMode.METADATA_BOOTSTRAP_MODE); + } + + private enum EffectiveMode { + FULL_BOOTSTRAP_MODE, + METADATA_BOOTSTRAP_MODE, + MIXED_BOOTSTRAP_MODE + } + + private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, EffectiveMode mode) throws Exception { + + if (deltaCommit) { + metaClient = HoodieTestUtils.init(basePath, HoodieTableType.MERGE_ON_READ, bootstrapBasePath, HoodieFileFormat.ORC); + } else { + metaClient = HoodieTestUtils.init(basePath, 
HoodieTableType.COPY_ON_WRITE, bootstrapBasePath, HoodieFileFormat.ORC); + } + + int totalRecords = 100; + String keyGeneratorClass = partitioned ? SimpleKeyGenerator.class.getCanonicalName() + : NonpartitionedKeyGenerator.class.getCanonicalName(); + final String bootstrapModeSelectorClass; + final String bootstrapCommitInstantTs; + final boolean checkNumRawFiles; + final boolean isBootstrapIndexCreated; + final int numInstantsAfterBootstrap; + final List bootstrapInstants; + switch (mode) { + case FULL_BOOTSTRAP_MODE: + bootstrapModeSelectorClass = FullRecordBootstrapModeSelector.class.getCanonicalName(); + bootstrapCommitInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS; + checkNumRawFiles = false; + isBootstrapIndexCreated = false; + numInstantsAfterBootstrap = 1; + bootstrapInstants = Arrays.asList(bootstrapCommitInstantTs); + break; + case METADATA_BOOTSTRAP_MODE: + bootstrapModeSelectorClass = MetadataOnlyBootstrapModeSelector.class.getCanonicalName(); + bootstrapCommitInstantTs = HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS; + checkNumRawFiles = true; + isBootstrapIndexCreated = true; + numInstantsAfterBootstrap = 1; + bootstrapInstants = Arrays.asList(bootstrapCommitInstantTs); + break; + default: + bootstrapModeSelectorClass = TestRandomBootstapModeSelector.class.getName(); + bootstrapCommitInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS; + checkNumRawFiles = false; + isBootstrapIndexCreated = true; + numInstantsAfterBootstrap = 2; + bootstrapInstants = Arrays.asList(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, + HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS); + break; + } + List partitions = Arrays.asList("2020/04/01", "2020/04/02", "2020/04/03"); + long timestamp = Instant.now().toEpochMilli(); + Schema schema = generateNewDataSetAndReturnSchema(timestamp, totalRecords, partitions, bootstrapBasePath); + HoodieWriteConfig config = getConfigBuilder(schema.toString()) + .withAutoCommit(true) + .withSchema(schema.toString()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(1) + .build()) + .withBootstrapConfig(HoodieBootstrapConfig.newBuilder() + .withBootstrapBasePath(bootstrapBasePath) + .withBootstrapKeyGenClass(keyGeneratorClass) + .withFullBootstrapInputProvider(TestFullBootstrapDataProvider.class.getName()) + .withBootstrapParallelism(3) + .withBootstrapModeSelector(bootstrapModeSelectorClass).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .build(); + SparkRDDWriteClient client = new SparkRDDWriteClient(context, config); + client.bootstrap(Option.empty()); + checkBootstrapResults(totalRecords, schema, bootstrapCommitInstantTs, checkNumRawFiles, numInstantsAfterBootstrap, + numInstantsAfterBootstrap, timestamp, timestamp, deltaCommit, bootstrapInstants); + + // Rollback Bootstrap + if (deltaCommit) { + FileCreateUtils.deleteDeltaCommit(metaClient.getBasePath(), bootstrapCommitInstantTs); + } else { + FileCreateUtils.deleteCommit(metaClient.getBasePath(), bootstrapCommitInstantTs); + } + client.rollbackFailedBootstrap(); + metaClient.reloadActiveTimeline(); + assertEquals(0, metaClient.getCommitsTimeline().countInstants()); + assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, context) + .stream().flatMap(f -> f.getValue().stream()).count()); + + BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient); + assertFalse(index.useIndex()); + + // Run bootstrap again + client = new SparkRDDWriteClient(context, config); + 
client.bootstrap(Option.empty()); + + metaClient.reloadActiveTimeline(); + index = BootstrapIndex.getBootstrapIndex(metaClient); + if (isBootstrapIndexCreated) { + assertTrue(index.useIndex()); + } else { + assertFalse(index.useIndex()); + } + + checkBootstrapResults(totalRecords, schema, bootstrapCommitInstantTs, checkNumRawFiles, numInstantsAfterBootstrap, + numInstantsAfterBootstrap, timestamp, timestamp, deltaCommit, bootstrapInstants); + + // Upsert case + long updateTimestamp = Instant.now().toEpochMilli(); + String updateSPath = tmpFolder.toAbsolutePath().toString() + "/data2"; + generateNewDataSetAndReturnSchema(updateTimestamp, totalRecords, partitions, updateSPath); + JavaRDD updateBatch = + generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), updateSPath, context), + schema); + String newInstantTs = client.startCommit(); + client.upsert(updateBatch, newInstantTs); + checkBootstrapResults(totalRecords, schema, newInstantTs, false, numInstantsAfterBootstrap + 1, + updateTimestamp, deltaCommit ? timestamp : updateTimestamp, deltaCommit); + + if (deltaCommit) { + Option compactionInstant = client.scheduleCompaction(Option.empty()); + assertTrue(compactionInstant.isPresent()); + client.compact(compactionInstant.get()); + checkBootstrapResults(totalRecords, schema, compactionInstant.get(), checkNumRawFiles, + numInstantsAfterBootstrap + 2, 2, updateTimestamp, updateTimestamp, !deltaCommit, + Arrays.asList(compactionInstant.get())); + } + } + + @Test + public void testMetadataBootstrapWithUpdatesMOR() throws Exception { + testBootstrapCommon(true, true, EffectiveMode.METADATA_BOOTSTRAP_MODE); + } + + @Test + public void testFullBootstrapOnlyCOW() throws Exception { + testBootstrapCommon(true, false, EffectiveMode.FULL_BOOTSTRAP_MODE); + } + + @Test + public void testFullBootstrapWithUpdatesMOR() throws Exception { + testBootstrapCommon(true, true, EffectiveMode.FULL_BOOTSTRAP_MODE); + } + + @Test + public void testMetaAndFullBootstrapCOW() throws Exception { + testBootstrapCommon(true, false, EffectiveMode.MIXED_BOOTSTRAP_MODE); + } + + @Test + public void testMetadataAndFullBootstrapWithUpdatesMOR() throws Exception { + testBootstrapCommon(true, true, EffectiveMode.MIXED_BOOTSTRAP_MODE); + } + + private void checkBootstrapResults(int totalRecords, Schema schema, String maxInstant, boolean checkNumRawFiles, + int expNumInstants, long expTimestamp, long expROTimestamp, boolean isDeltaCommit) throws Exception { + checkBootstrapResults(totalRecords, schema, maxInstant, checkNumRawFiles, expNumInstants, expNumInstants, + expTimestamp, expROTimestamp, isDeltaCommit, Arrays.asList(maxInstant)); + } + + private void checkBootstrapResults(int totalRecords, Schema schema, String instant, boolean checkNumRawFiles, + int expNumInstants, int numVersions, long expTimestamp, long expROTimestamp, boolean isDeltaCommit, + List instantsWithValidRecords) throws Exception { + metaClient.reloadActiveTimeline(); + assertEquals(expNumInstants, metaClient.getCommitsTimeline().filterCompletedInstants().countInstants()); + assertEquals(instant, metaClient.getActiveTimeline() + .getCommitsTimeline().filterCompletedInstants().lastInstant().get().getTimestamp()); + + Dataset bootstrapped = sqlContext.read().format("orc").load(basePath); + Dataset original = sqlContext.read().format("orc").load(bootstrapBasePath); + bootstrapped.registerTempTable("bootstrapped"); + original.registerTempTable("original"); + if (checkNumRawFiles) { + List files = 
BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), + bootstrapBasePath, context).stream().flatMap(x -> x.getValue().stream()).collect(Collectors.toList()); + assertEquals(files.size() * numVersions, + sqlContext.sql("select distinct _hoodie_file_name from bootstrapped").count()); + } + + if (!isDeltaCommit) { + String predicate = String.join(", ", + instantsWithValidRecords.stream().map(p -> "\"" + p + "\"").collect(Collectors.toList())); + assertEquals(totalRecords, sqlContext.sql("select * from bootstrapped where _hoodie_commit_time IN " + + "(" + predicate + ")").count()); + Dataset missingOriginal = sqlContext.sql("select a._row_key from original a where a._row_key not " + + "in (select _hoodie_record_key from bootstrapped)"); + assertEquals(0, missingOriginal.count()); + Dataset missingBootstrapped = sqlContext.sql("select a._hoodie_record_key from bootstrapped a " + + "where a._hoodie_record_key not in (select _row_key from original)"); + assertEquals(0, missingBootstrapped.count()); + //sqlContext.sql("select * from bootstrapped").show(10, false); + } + + + } + + public static class TestFullBootstrapDataProvider extends FullRecordBootstrapDataProvider> { + + public TestFullBootstrapDataProvider(TypedProperties props, HoodieSparkEngineContext context) { + super(props, context); + } + + @Override + public JavaRDD generateInputRecords(String tableName, String sourceBasePath, + List>> partitionPaths) { + String[] filePaths = partitionPaths.stream().map(Pair::getValue) + .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString())) + .toArray(String[]::new); + + JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); + + String filePath = FileStatusUtils.toPath(partitionPaths.stream().flatMap(p -> p.getValue().stream()) + .findAny().get().getPath()).toString(); + try { + Reader orcReader = OrcFile.createReader( + new Path(filePath), new OrcFile.ReaderOptions(jsc.hadoopConfiguration())); + + TypeDescription orcSchema = orcReader.getSchema(); + Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "test_orc_record", null, true); + return generateInputBatch(jsc, partitionPaths, avroSchema); + + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + } + } + + private static JavaRDD generateInputBatch(JavaSparkContext jsc, + List>> partitionPaths, Schema writerSchema) { + List> fullFilePathsWithPartition = partitionPaths.stream().flatMap(p -> p.getValue().stream() + .map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath())))).collect(Collectors.toList()); + return jsc.parallelize(fullFilePathsWithPartition.stream().flatMap(p -> { + try { + Configuration conf = jsc.hadoopConfiguration(); + AvroReadSupport.setAvroReadSchema(conf, writerSchema); + Reader orcReader = OrcFile.createReader( + p.getValue(), + new OrcFile.ReaderOptions(jsc.hadoopConfiguration())); + RecordReader recordReader = orcReader.rows(); + + TypeDescription orcSchema = orcReader.getSchema(); + + Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "test_orc_record", null, true); + + Iterator recIterator = new OrcReaderIterator(recordReader, avroSchema, orcSchema); + + return StreamSupport.stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false).map(gr -> { + try { + String key = gr.get("_row_key").toString(); + String pPath = p.getKey(); + return new HoodieRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, + 
HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + }); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + }).collect(Collectors.toList())); + } + + public static class TestRandomBootstapModeSelector extends BootstrapModeSelector { + private int currIdx = new Random().nextInt(2); + + public TestRandomBootstapModeSelector(HoodieWriteConfig writeConfig) { + super(writeConfig); + } + + @Override + public Map> select(List>> partitions) { + List> selections = new ArrayList<>(); + partitions.stream().forEach(p -> { + final BootstrapMode mode; + if (currIdx == 0) { + mode = BootstrapMode.METADATA_ONLY; + } else { + mode = BootstrapMode.FULL_RECORD; + } + currIdx = (currIdx + 1) % 2; + selections.add(Pair.of(mode, p.getKey())); + }); + return selections.stream().collect(Collectors.groupingBy(Pair::getKey, mapping(Pair::getValue, toList()))); + } + } + + public HoodieWriteConfig.Builder getConfigBuilder(String schemaStr) { + HoodieWriteConfig.Builder builder = getConfigBuilder(schemaStr, IndexType.BLOOM) + .withExternalSchemaTrasformation(true); + TypedProperties properties = new TypedProperties(); + properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key"); + properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "datestr"); + builder = builder.withProps(properties); + return builder; + } + + public static Dataset generateTestRawTripDataset(long timestamp, int from, int to, List partitionPaths, + JavaSparkContext jsc, SQLContext sqlContext) { + boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty(); + final List records = new ArrayList<>(); + IntStream.range(from, to).forEach(i -> { + String id = "" + i; + records.add(generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, + timestamp, false, false).toString()); + }); + if (isPartitioned) { + sqlContext.udf().register("partgen", + (UDF1) (val) -> PartitionPathEncodeUtils.escapePathName(partitionPaths.get( + Integer.parseInt(val.split("_")[1]) % partitionPaths.size())), + DataTypes.StringType); + } + JavaRDD rdd = jsc.parallelize(records); + Dataset df = sqlContext.read().json(rdd); + if (isPartitioned) { + df = df.withColumn("datestr", callUDF("partgen", new Column("_row_key"))); + // Order the columns to ensure generated avro schema aligns with Hive schema + df = df.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", + "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted", "datestr"); + } else { + // Order the columns to ensure generated avro schema aligns with Hive schema + df = df.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", + "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted"); + } + return df; + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/HoodieSparkSqlWriterSuite.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/HoodieSparkSqlWriterSuite.scala index ac2d8991ffb1f..96fb18db36728 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/HoodieSparkSqlWriterSuite.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/HoodieSparkSqlWriterSuite.scala @@ -19,7 +19,7 @@ package org.apache.hudi import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.Path -import org.apache.hudi.DataSourceWriteOptions.{INSERT_DROP_DUPS, 
INSERT_OPERATION_OPT_VAL, MOR_TABLE_TYPE_OPT_VAL, OPERATION, TABLE_TYPE} +import org.apache.hudi.DataSourceWriteOptions.{INSERT_DROP_DUPS, INSERT_OPERATION_OPT_VAL, KEYGENERATOR_CLASS_NAME, MOR_TABLE_TYPE_OPT_VAL, OPERATION, PAYLOAD_CLASS_NAME, PRECOMBINE_FIELD, RECORDKEY_FIELD, TABLE_TYPE} import org.apache.hudi.client.SparkRDDWriteClient import org.apache.hudi.common.config.HoodieConfig import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord, HoodieRecordPayload, HoodieTableType, WriteOperationType} @@ -30,7 +30,7 @@ import org.apache.hudi.exception.HoodieException import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode import org.apache.hudi.functional.TestBootstrap import org.apache.hudi.hive.HiveSyncConfig -import org.apache.hudi.keygen.{NonpartitionedKeyGenerator, SimpleKeyGenerator} +import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.testutils.DataSourceTestUtils import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaSparkContext @@ -48,8 +48,10 @@ import org.scalatest.Matchers.{assertResult, be, convertToAnyShouldWrapper, inte import java.time.Instant import java.util import java.util.{Collections, Date, UUID} + import scala.collection.JavaConversions._ import scala.collection.JavaConverters +import scala.util.control.NonFatal /** * Test suite for SparkSqlWriter class. @@ -161,7 +163,6 @@ class HoodieSparkSqlWriterSuite { .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") .updated(HoodieTableConfig.POPULATE_META_FIELDS.key(), String.valueOf(populateMetaFields)) .updated(HoodieWriteConfig.BULK_INSERT_SORT_MODE.key(), sortMode.name()) - val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) // generate the inserts val schema = DataSourceTestUtils.getStructTypeExampleSchema @@ -175,7 +176,7 @@ class HoodieSparkSqlWriterSuite { val recordsSeq = convertRowListToSeq(records) val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) // write to Hudi - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) // collect all partition paths to issue read of parquet files val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, @@ -242,21 +243,19 @@ class HoodieSparkSqlWriterSuite { //create a new table val fooTableModifier = Map("path" -> tempBasePath, HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, "hoodie.insert.shuffle.parallelism" -> "4", "hoodie.upsert.shuffle.parallelism" -> "4") - val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) val dataFrame = spark.createDataFrame(Seq(StringLongTest(UUID.randomUUID().toString, new Date().getTime))) - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, dataFrame) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, dataFrame) //on same path try append with different("hoodie_bar_tbl") table name which should throw an exception val barTableModifier = Map("path" -> tempBasePath, HoodieWriteConfig.TBL_NAME.key -> "hoodie_bar_tbl", "hoodie.insert.shuffle.parallelism" -> "4", "hoodie.upsert.shuffle.parallelism" -> "4") - val barTableParams = HoodieWriterUtils.parametersWithWriteDefaults(barTableModifier) val dataFrame2 = spark.createDataFrame(Seq(StringLongTest(UUID.randomUUID().toString, new Date().getTime))) - val tableAlreadyExistException 
= intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, barTableParams, dataFrame2)) + val tableAlreadyExistException = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, barTableModifier, dataFrame2)) assert(tableAlreadyExistException.getMessage.contains("hoodie table with name " + hoodieFooTableName + " already exist")) //on same path try append with delete operation and different("hoodie_bar_tbl") table name which should throw an exception - val deleteTableParams = barTableParams ++ Map(OPERATION.key -> "delete") - val deleteCmdException = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, deleteTableParams, dataFrame2)) + val deleteTableModifier = barTableModifier ++ Map(OPERATION.key -> "delete") + val deleteCmdException = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, deleteTableModifier, dataFrame2)) assert(deleteCmdException.getMessage.contains("hoodie table with name " + hoodieFooTableName + " already exist")) } @@ -295,7 +294,6 @@ class HoodieSparkSqlWriterSuite { .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") .updated(HoodieWriteConfig.BULK_INSERT_SORT_MODE.key(), BulkInsertSortMode.NONE.name()) - val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) // generate the inserts val schema = DataSourceTestUtils.getStructTypeExampleSchema @@ -304,7 +302,7 @@ class HoodieSparkSqlWriterSuite { val df = spark.createDataFrame(sc.parallelize(inserts), structType) try { // write to Hudi - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) Assertions.fail("Should have thrown exception") } catch { case e: HoodieException => assertTrue(e.getMessage.contains("hoodie.populate.meta.fields already disabled for the table. 
Can't be re-enabled back")) @@ -323,7 +321,6 @@ class HoodieSparkSqlWriterSuite { .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") .updated(INSERT_DROP_DUPS.key, "true") - val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) // generate the inserts val schema = DataSourceTestUtils.getStructTypeExampleSchema @@ -332,7 +329,7 @@ class HoodieSparkSqlWriterSuite { val recordsSeq = convertRowListToSeq(records) val df = spark.createDataFrame(spark.sparkContext.parallelize(recordsSeq), structType) // write to Hudi - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) fail("Drop duplicates with bulk insert in row writing should have thrown exception") } catch { case e: HoodieException => assertTrue(e.getMessage.contains("Dropping duplicates with bulk_insert in row writer path is not supported yet")) @@ -348,7 +345,6 @@ class HoodieSparkSqlWriterSuite { //create a new table val fooTableModifier = commonTableModifier.updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .updated(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "false") - val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) // generate the inserts val schema = DataSourceTestUtils.getStructTypeExampleSchema @@ -357,7 +353,7 @@ class HoodieSparkSqlWriterSuite { val recordsSeq = convertRowListToSeq(records) val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) // write to Hudi - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams - DataSourceWriteOptions.PRECOMBINE_FIELD.key, df) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier - DataSourceWriteOptions.PRECOMBINE_FIELD.key, df) // collect all partition paths to issue read of parquet files val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, @@ -384,7 +380,6 @@ class HoodieSparkSqlWriterSuite { val fooTableModifier = commonTableModifier.updated("hoodie.bulkinsert.shuffle.parallelism", "4") .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") - val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH) val fullPartitionPaths = new Array[String](3) @@ -400,7 +395,7 @@ class HoodieSparkSqlWriterSuite { val recordsSeq = convertRowListToSeq(records) val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) // write to Hudi - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) // Fetch records from entire dataset val actualDf = sqlContext.read.parquet(fullPartitionPaths(0), fullPartitionPaths(1), fullPartitionPaths(2)) // remove metadata columns so that expected and actual DFs can be compared as is @@ -445,11 +440,12 @@ class HoodieSparkSqlWriterSuite { val records = DataSourceTestUtils.generateRandomRows(100) val recordsSeq = convertRowListToSeq(records) val df = spark.createDataFrame(sc.parallelize(recordsSeq), 
structType) + initializeMetaClientForBootstrap(fooTableParams, tableType, false) val client = spy(DataSourceUtils.createHoodieClient( new JavaSparkContext(sc), modifiedSchema.toString, tempBasePath, hoodieFooTableName, mapAsJavaMap(fooTableParams)).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]]) - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df, Option.empty, Option(client)) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df, Option.empty, Option(client)) // Verify that asynchronous compaction is not scheduled verify(client, times(0)).scheduleCompaction(any()) // Verify that HoodieWriteClient is closed correctly @@ -501,6 +497,7 @@ class HoodieSparkSqlWriterSuite { DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", HoodieBootstrapConfig.KEYGEN_CLASS_NAME.key -> classOf[NonpartitionedKeyGenerator].getCanonicalName) val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) + initializeMetaClientForBootstrap(fooTableParams, tableType, true) val client = spy(DataSourceUtils.createHoodieClient( new JavaSparkContext(sc), @@ -509,7 +506,7 @@ class HoodieSparkSqlWriterSuite { hoodieFooTableName, mapAsJavaMap(fooTableParams)).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]]) - HoodieSparkSqlWriter.bootstrap(sqlContext, SaveMode.Append, fooTableParams, spark.emptyDataFrame, Option.empty, + HoodieSparkSqlWriter.bootstrap(sqlContext, SaveMode.Append, fooTableModifier, spark.emptyDataFrame, Option.empty, Option(client)) // Verify that HoodieWriteClient is closed correctly @@ -522,6 +519,27 @@ class HoodieSparkSqlWriterSuite { } } + def initializeMetaClientForBootstrap(fooTableParams : Map[String, String], tableType: String, addBootstrapPath : Boolean) : Unit = { + // when metadata is enabled, directly instantiating write client using DataSourceUtils.createHoodieClient + // will hit a code which tries to instantiate meta client for data table. if table does not exist, it fails. + // hence doing an explicit instantiation here. + val tableMetaClientBuilder = HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setTableName(hoodieFooTableName) + .setRecordKeyFields(fooTableParams(DataSourceWriteOptions.RECORDKEY_FIELD.key)) + .setBaseFileFormat(HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().name()) + .setArchiveLogFolder(HoodieTableConfig.ARCHIVELOG_FOLDER.defaultValue()) + .setPayloadClassName(fooTableParams(PAYLOAD_CLASS_NAME.key)) + .setPreCombineField(fooTableParams(PRECOMBINE_FIELD.key)) + .setPartitionFields(fooTableParams(DataSourceWriteOptions.PARTITIONPATH_FIELD.key)) + .setKeyGeneratorClassProp(fooTableParams(KEYGENERATOR_CLASS_NAME.key)) + if(addBootstrapPath) { + tableMetaClientBuilder + .setBootstrapBasePath(fooTableParams(HoodieBootstrapConfig.BASE_PATH.key)) + } + tableMetaClientBuilder.initTable(sc.hadoopConfiguration, tempBasePath) + } + /** * Test cases for schema evolution in different types of tables. 
* @@ -533,7 +551,6 @@ class HoodieSparkSqlWriterSuite { //create a new table val fooTableModifier = getCommonParams(tempPath, hoodieFooTableName, tableType) .updated(DataSourceWriteOptions.RECONCILE_SCHEMA.key, "true") - val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) // generate the inserts val schema = DataSourceTestUtils.getStructTypeExampleSchema @@ -541,7 +558,7 @@ class HoodieSparkSqlWriterSuite { var records = DataSourceTestUtils.generateRandomRows(10) var recordsSeq = convertRowListToSeq(records) val df1 = spark.createDataFrame(sc.parallelize(recordsSeq), structType) - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, fooTableParams, df1) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, fooTableModifier, df1) val snapshotDF1 = spark.read.format("org.apache.hudi") .load(tempBasePath + "/*/*/*/*") @@ -554,7 +571,7 @@ class HoodieSparkSqlWriterSuite { // issue updates so that log files are created for MOR table val updatesSeq = convertRowListToSeq(DataSourceTestUtils.generateUpdates(records, 5)) val updatesDf = spark.createDataFrame(sc.parallelize(updatesSeq), structType) - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, updatesDf) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, updatesDf) val snapshotDF2 = spark.read.format("org.apache.hudi") .load(tempBasePath + "/*/*/*/*") @@ -572,7 +589,7 @@ class HoodieSparkSqlWriterSuite { recordsSeq = convertRowListToSeq(records) val df3 = spark.createDataFrame(sc.parallelize(recordsSeq), evolStructType) // write to Hudi with new column - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df3) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df3) val snapshotDF3 = spark.read.format("org.apache.hudi") .load(tempBasePath + "/*/*/*/*") @@ -587,7 +604,7 @@ class HoodieSparkSqlWriterSuite { records = DataSourceTestUtils.generateRandomRows(10) recordsSeq = convertRowListToSeq(records) val df4 = spark.createDataFrame(sc.parallelize(recordsSeq), structType) - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df4) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df4) val snapshotDF4 = spark.read.format("org.apache.hudi") .load(tempBasePath + "/*/*/*/*") @@ -720,14 +737,13 @@ class HoodieSparkSqlWriterSuite { @ValueSource(booleans = Array(true, false)) def testDeletePartitionsV2(usePartitionsToDeleteConfig: Boolean): Unit = { val fooTableModifier = getCommonParams(tempPath, hoodieFooTableName, HoodieTableType.COPY_ON_WRITE.name()) - val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier) val schema = DataSourceTestUtils.getStructTypeExampleSchema val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) val records = DataSourceTestUtils.generateRandomRows(10) val recordsSeq = convertRowListToSeq(records) val df1 = spark.createDataFrame(sc.parallelize(recordsSeq), structType) // write to Hudi - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, fooTableParams, df1) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, fooTableModifier, df1) val snapshotDF1 = spark.read.format("org.apache.hudi") .load(tempBasePath + "/*/*/*/*") assertEquals(10, snapshotDF1.count()) @@ -738,7 +754,7 @@ class HoodieSparkSqlWriterSuite { val updatesSeq = convertRowListToSeq(DataSourceTestUtils.generateUpdates(records, 5)) val updatesDf = spark.createDataFrame(sc.parallelize(updatesSeq), structType) // write 
updates to Hudi - HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, updatesDf) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, updatesDf) val snapshotDF2 = spark.read.format("org.apache.hudi") .load(tempBasePath + "/*/*/*/*") assertEquals(10, snapshotDF2.count()) @@ -747,7 +763,7 @@ class HoodieSparkSqlWriterSuite { // ensure 2nd batch of updates matches. assert(updatesDf.intersect(trimmedDf2).except(updatesDf).count() == 0) if (usePartitionsToDeleteConfig) { - fooTableParams.updated(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key(), HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + fooTableModifier.updated(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key(), HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) } // delete partitions contains the primary key val recordsToDelete = df1.filter(entry => { @@ -755,7 +771,7 @@ class HoodieSparkSqlWriterSuite { partitionPath.equals(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) || partitionPath.equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH) }) - val updatedParams = fooTableParams.updated(DataSourceWriteOptions.OPERATION.key(), WriteOperationType.DELETE_PARTITION.name()) + val updatedParams = fooTableModifier.updated(DataSourceWriteOptions.OPERATION.key(), WriteOperationType.DELETE_PARTITION.name()) HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, updatedParams, recordsToDelete) val snapshotDF3 = spark.read.format("org.apache.hudi") .load(tempBasePath + "/*/*/*/*") @@ -796,4 +812,88 @@ class HoodieSparkSqlWriterSuite { assert(spark.read.format("hudi").load(tempBasePath).where("age >= 2000").count() == 10) } } + + /** + * Test case for no need to specify hiveStylePartitioning/urlEncodePartitioning/KeyGenerator included in HoodieTableConfig except the first time write + */ + @Test + def testToWriteWithoutParametersIncludedInHoodieTableConfig(): Unit = { + val _spark = spark + import _spark.implicits._ + val df = Seq((1, "a1", 10, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + val options = Map( + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "id", + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "ts", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "dt" + ) + + // case 1: test table which created by sql + val (tableName1, tablePath1) = ("hoodie_test_params_1", s"$tempBasePath" + "_1") + spark.sql( + s""" + | create table $tableName1 ( + | id int, + | name string, + | price double, + | ts long, + | dt string + | ) using hudi + | partitioned by (dt) + | options ( + | primaryKey = 'id' + | ) + | location '$tablePath1' + """.stripMargin) + val tableConfig1 = HoodieTableMetaClient.builder() + .setConf(spark.sparkContext.hadoopConfiguration) + .setBasePath(tablePath1).build().getTableConfig + assert(tableConfig1.getHiveStylePartitioningEnable == "true") + assert(tableConfig1.getUrlEncodePartitoning == "false") + assert(tableConfig1.getKeyGeneratorClassName == classOf[ComplexKeyGenerator].getName) + df.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName1) + .mode(SaveMode.Append).save(tablePath1) + assert(spark.read.format("hudi").load(tablePath1 + "/*").count() == 1) + + // case 2: test table which created by dataframe + val (tableName2, tablePath2) = ("hoodie_test_params_2", s"$tempBasePath" + "_2") + // the first write need to specify params + df.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName2) + .option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, "true") + 
.option(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key, classOf[SimpleKeyGenerator].getName) + .mode(SaveMode.Overwrite).save(tablePath2) + val tableConfig2 = HoodieTableMetaClient.builder() + .setConf(spark.sparkContext.hadoopConfiguration) + .setBasePath(tablePath2).build().getTableConfig + assert(tableConfig2.getHiveStylePartitioningEnable == "false") + assert(tableConfig2.getUrlEncodePartitoning == "true") + assert(tableConfig2.getKeyGeneratorClassName == classOf[SimpleKeyGenerator].getName) + + val df2 = Seq((2, "a2", 20, 1000, "2021-10-16")).toDF("id", "name", "value", "ts", "dt") + // raise exception when use params which is not same with HoodieTableConfig + try { + df2.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName2) + .option(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key, classOf[ComplexKeyGenerator].getName) + .mode(SaveMode.Append).save(tablePath2) + } catch { + case NonFatal(e) => + assert(e.getMessage.contains("Config conflict")) + assert(e.getMessage.contains( + s"${HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key}\t${classOf[ComplexKeyGenerator].getName}\t${classOf[SimpleKeyGenerator].getName}")) + } + + // do not need to specify hiveStylePartitioning/urlEncodePartitioning/KeyGenerator params + df2.write.format("hudi") + .options(options) + .option(HoodieWriteConfig.TBL_NAME.key, tableName2) + .mode(SaveMode.Append).save(tablePath2) + val data = spark.read.format("hudi").load(tablePath2 + "/*") + assert(data.count() == 2) + assert(data.select("_hoodie_partition_path").map(_.getString(0)).distinct.collect.head == "dt=2021-10-16") + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala index 94e9620d0946f..7c58cc07ea38b 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala @@ -17,6 +17,8 @@ package org.apache.hudi +import java.util.Properties + import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.config.HoodieMetadataConfig import org.apache.hudi.common.table.HoodieTableMetaClient @@ -58,6 +60,7 @@ class TestHoodieFileIndex extends HoodieClientTestBase { ) @BeforeEach override def setUp() { + setTableName("hoodie_test") initPath() initSparkContexts() spark = sqlContext.sparkSession @@ -71,6 +74,9 @@ class TestHoodieFileIndex extends HoodieClientTestBase { @ParameterizedTest @ValueSource(booleans = Array(true, false)) def testPartitionSchema(partitionEncode: Boolean): Unit = { + val props = new Properties() + props.setProperty(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, String.valueOf(partitionEncode)) + initMetaClient(props) val records1 = dataGen.generateInsertsContainsAllPartitions("000", 100) val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2)) inputDF1.write.format("hudi") @@ -128,6 +134,9 @@ class TestHoodieFileIndex extends HoodieClientTestBase { @ParameterizedTest @ValueSource(booleans = Array(true, false)) def testPartitionPruneWithPartitionEncode(partitionEncode: Boolean): Unit = { + val props = new Properties() + props.setProperty(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, String.valueOf(partitionEncode)) + initMetaClient(props) val partitions = Array("2021/03/08", "2021/03/09", "2021/03/10", "2021/03/11", "2021/03/12") val newDataGen = new 
HoodieTestDataGenerator(partitions) val records1 = newDataGen.generateInsertsContainsAllPartitions("000", 100) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala index b86eade9bb951..1b756b5e2365f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala @@ -41,14 +41,18 @@ class TestHoodieSparkUtils { def testGlobPaths(@TempDir tempDir: File): Unit = { val folders: Seq[Path] = Seq( new Path(Paths.get(tempDir.getAbsolutePath, "folder1").toUri), - new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri) + new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri), + new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie").toUri), + new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie", "metadata").toUri) ) val files: Seq[Path] = Seq( new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file1").toUri), new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file2").toUri), new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file3").toUri), - new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file4").toUri) + new Path(Paths.get(tempDir.getAbsolutePath, "folder2","file4").toUri), + new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie","metadata", "file5").toUri), + new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie","metadata", "file6").toUri) ) folders.foreach(folder => new File(folder.toUri).mkdir()) @@ -57,12 +61,14 @@ class TestHoodieSparkUtils { var paths = Seq(tempDir.getAbsolutePath + "/*") var globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths, new Path(paths.head).getFileSystem(new Configuration())) - assertEquals(folders.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString)) + assertEquals(folders.filterNot(entry => entry.toString.contains(".hoodie")) + .sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString)) paths = Seq(tempDir.getAbsolutePath + "/*/*") globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths, new Path(paths.head).getFileSystem(new Configuration())) - assertEquals(files.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString)) + assertEquals(files.filterNot(entry => entry.toString.contains(".hoodie")) + .sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString)) paths = Seq(tempDir.getAbsolutePath + "/folder1/*") globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths, @@ -79,7 +85,8 @@ class TestHoodieSparkUtils { paths = Seq(tempDir.getAbsolutePath + "/folder1/*", tempDir.getAbsolutePath + "/folder2/*") globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths, new Path(paths.head).getFileSystem(new Configuration())) - assertEquals(files.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString)) + assertEquals(files.filterNot(entry => entry.toString.contains(".hoodie")) + .sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString)) } @Test diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala new file mode 100644 index 0000000000000..06ac600b0346e --- /dev/null +++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestOptimizeTable.scala @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.functional + +import java.sql.{Date, Timestamp} + +import org.apache.hadoop.fs.Path +import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig} +import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions} +import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings +import org.apache.hudi.testutils.HoodieClientTestBase +import org.apache.spark.ZCurveOptimizeHelper +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} +import org.junit.jupiter.params.ParameterizedTest +import org.junit.jupiter.params.provider.ValueSource + +import scala.collection.JavaConversions._ +import scala.util.Random + +class TestOptimizeTable extends HoodieClientTestBase { + var spark: SparkSession = null + + val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4", + "hoodie.bulkinsert.shuffle.parallelism" -> "4", + DataSourceWriteOptions.RECORDKEY_FIELD.key() -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key() -> "partition", + DataSourceWriteOptions.PRECOMBINE_FIELD.key() -> "timestamp", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" + ) + + @BeforeEach override def setUp() { + initPath() + initSparkContexts() + spark = sqlContext.sparkSession + initTestDataGenerator() + initFileSystem() + } + + @AfterEach override def tearDown() = { + cleanupSparkContexts() + cleanupTestDataGenerator() + cleanupFileSystem() + } + + @ParameterizedTest + @ValueSource(strings = Array("COPY_ON_WRITE", "MERGE_ON_READ")) + def testOptimizewithClustering(tableType: String): Unit = { + // Bulk Insert Operation + val records1 = recordsToStrings(dataGen.generateInserts("001", 1000)).toList + val inputDF1: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records1, 2)) + inputDF1.write.format("org.apache.hudi") + .options(commonOpts) + .option("hoodie.compact.inline", "false") + .option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) + .option(DataSourceWriteOptions.TABLE_TYPE.key(), tableType) + // option for clustering + .option("hoodie.parquet.small.file.limit", "0") + .option("hoodie.clustering.inline", "true") + .option("hoodie.clustering.inline.max.commits", "1") + .option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824") + .option("hoodie.clustering.plan.strategy.small.file.limit", "629145600") + .option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString) + 
.option("hoodie.clustering.plan.strategy.target.file.max.bytes", String.valueOf(64 *1024 * 1024L)) + .option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE.key, "true") + .option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "begin_lat, begin_lon") + .mode(SaveMode.Overwrite) + .save(basePath) + + assertEquals(1000, spark.read.format("hudi").load(basePath).count()) + assertEquals(1000, + spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true").format("hudi").load(basePath).count()) + } + + @Test + def testCollectMinMaxStatistics(): Unit = { + val testPath = new Path(System.getProperty("java.io.tmpdir"), "minMax") + val statisticPath = new Path(System.getProperty("java.io.tmpdir"), "stat") + val fs = testPath.getFileSystem(spark.sparkContext.hadoopConfiguration) + try { + val complexDataFrame = createComplexDataFrame(spark) + complexDataFrame.repartition(3).write.mode("overwrite").save(testPath.toString) + val df = spark.read.load(testPath.toString) + // do not support TimeStampType, so if we collect statistics for c4, should throw exception + val colDf = ZCurveOptimizeHelper.getMinMaxValue(df, "c1,c2,c3,c5,c6,c7,c8") + colDf.cache() + assertEquals(colDf.count(), 3) + assertEquals(colDf.take(1)(0).length, 22) + colDf.unpersist() + // try to save statistics + ZCurveOptimizeHelper.saveStatisticsInfo(df, "c1,c2,c3,c5,c6,c7,c8", statisticPath.toString, "2", Seq("0", "1")) + // save again + ZCurveOptimizeHelper.saveStatisticsInfo(df, "c1,c2,c3,c5,c6,c7,c8", statisticPath.toString, "3", Seq("0", "1", "2")) + // test old index table clean + ZCurveOptimizeHelper.saveStatisticsInfo(df, "c1,c2,c3,c5,c6,c7,c8", statisticPath.toString, "4", Seq("0", "1", "3")) + assertEquals(!fs.exists(new Path(statisticPath, "2")), true) + assertEquals(fs.exists(new Path(statisticPath, "3")), true) + } finally { + if (fs.exists(testPath)) fs.delete(testPath) + if (fs.exists(statisticPath)) fs.delete(statisticPath) + } + } + + def createComplexDataFrame(spark: SparkSession): DataFrame = { + val schema = new StructType() + .add("c1", IntegerType) + .add("c2", StringType) + .add("c3", DecimalType(9,3)) + .add("c4", TimestampType) + .add("c5", ShortType) + .add("c6", DateType) + .add("c7", BinaryType) + .add("c8", ByteType) + + val rdd = spark.sparkContext.parallelize(0 to 1000, 1).map { item => + val c1 = Integer.valueOf(item) + val c2 = s" ${item}sdc" + val c3 = new java.math.BigDecimal(s"${Random.nextInt(1000)}.${item}") + val c4 = new Timestamp(System.currentTimeMillis()) + val c5 = java.lang.Short.valueOf(s"${(item + 16) /10}") + val c6 = Date.valueOf(s"${2020}-${item % 11 + 1}-${item % 28 + 1}") + val c7 = Array(item).map(_.toByte) + val c8 = java.lang.Byte.valueOf("9") + + RowFactory.create(c1, c2, c3, c4, c5, c6, c7, c8) + } + spark.createDataFrame(rdd, schema) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala index bfd4423db16a1..663493438a9ef 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala @@ -17,6 +17,7 @@ package org.apache.hudi.functional +import org.apache.hadoop.fs.FileSystem import org.apache.hudi.common.config.HoodieMetadataConfig import org.apache.hudi.common.table.timeline.HoodieInstant import 
org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} @@ -74,6 +75,8 @@ class TestCOWDataSource extends HoodieClientTestBase { cleanupSparkContexts() cleanupTestDataGenerator() cleanupFileSystem() + FileSystem.closeAll() + System.gc() } @Test def testShortNameStorage() { @@ -368,7 +371,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val recordsDF = spark.createDataFrame(rdd, schema) recordsDF.write.format("org.apache.hudi") .options(commonOpts) - .mode(SaveMode.Append) + .mode(SaveMode.Overwrite) .save(basePath) val recordsReadDF = spark.read.format("org.apache.hudi") @@ -401,11 +404,15 @@ class TestCOWDataSource extends HoodieClientTestBase { } private def getDataFrameWriter(keyGenerator: String): DataFrameWriter[Row] = { + getDataFrameWriter(keyGenerator, true) + } + + private def getDataFrameWriter(keyGenerator: String, enableMetadata: Boolean): DataFrameWriter[Row] = { val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) - + val opts = commonOpts ++ Map(HoodieMetadataConfig.ENABLE.key() -> String.valueOf(enableMetadata)) inputDF.write.format("hudi") - .options(commonOpts) + .options(opts) .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, keyGenerator) .mode(SaveMode.Overwrite) } @@ -414,6 +421,7 @@ class TestCOWDataSource extends HoodieClientTestBase { // Without fieldType, the default is SIMPLE var writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) writer.partitionBy("current_ts") + .mode(SaveMode.Overwrite) .save(basePath) var recordsReadDF = spark.read.format("org.apache.hudi") .load(basePath + "/*/*") @@ -424,6 +432,7 @@ class TestCOWDataSource extends HoodieClientTestBase { writer.partitionBy("current_ts:TIMESTAMP") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") + .mode(SaveMode.Overwrite) .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") .load(basePath + "/*/*") @@ -431,10 +440,11 @@ class TestCOWDataSource extends HoodieClientTestBase { assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= udf_date_format(col("current_ts"))).count() == 0) // Mixed fieldType - writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) + writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, false) writer.partitionBy("driver", "rider:SIMPLE", "current_ts:TIMESTAMP") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") + .mode(SaveMode.Overwrite) .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") .load(basePath + "/*/*/*") @@ -442,7 +452,7 @@ class TestCOWDataSource extends HoodieClientTestBase { concat(col("driver"), lit("/"), col("rider"), lit("/"), udf_date_format(col("current_ts")))).count() == 0) // Test invalid partitionKeyType - writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) + writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, false) writer = writer.partitionBy("current_ts:DUMMY") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") @@ -459,9 +469,10 @@ class TestCOWDataSource extends HoodieClientTestBase { // Use the `driver` field as the partition key var writer = getDataFrameWriter(classOf[SimpleKeyGenerator].getName) writer.partitionBy("driver") + .mode(SaveMode.Overwrite) .save(basePath) var recordsReadDF = 
spark.read.format("org.apache.hudi") - .load(basePath + "/*/*") + .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("driver")).count() == 0) // Use the `driver,rider` field as the partition key, If no such field exists, the default value `default` is used @@ -469,7 +480,7 @@ class TestCOWDataSource extends HoodieClientTestBase { writer.partitionBy("driver", "rider") .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") - .load(basePath + "/*/*") + .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit("default")).count() == 0) } @@ -477,9 +488,10 @@ class TestCOWDataSource extends HoodieClientTestBase { // Use the `driver` field as the partition key var writer = getDataFrameWriter(classOf[ComplexKeyGenerator].getName) writer.partitionBy("driver") + .mode(SaveMode.Overwrite) .save(basePath) var recordsReadDF = spark.read.format("org.apache.hudi") - .load(basePath + "/*/*") + .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("driver")).count() == 0) // Use the `driver`,`rider` field as the partition key @@ -487,7 +499,7 @@ class TestCOWDataSource extends HoodieClientTestBase { writer.partitionBy("driver", "rider") .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") - .load(basePath + "/*/*") + .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= concat(col("driver"), lit("/"), col("rider"))).count() == 0) } @@ -496,6 +508,7 @@ class TestCOWDataSource extends HoodieClientTestBase { writer.partitionBy("current_ts") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") + .mode(SaveMode.Overwrite) .save(basePath) val recordsReadDF = spark.read.format("org.apache.hudi") @@ -507,10 +520,11 @@ class TestCOWDataSource extends HoodieClientTestBase { @Test def testSparkPartitonByWithGlobalDeleteKeyGenerator() { val writer = getDataFrameWriter(classOf[GlobalDeleteKeyGenerator].getName) writer.partitionBy("driver") + .mode(SaveMode.Overwrite) .save(basePath) val recordsReadDF = spark.read.format("org.apache.hudi") - .load(basePath + "/*") + .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit("")).count() == 0) } @@ -518,17 +532,19 @@ class TestCOWDataSource extends HoodieClientTestBase { // Empty string column var writer = getDataFrameWriter(classOf[NonpartitionedKeyGenerator].getName) writer.partitionBy("") + .mode(SaveMode.Overwrite) .save(basePath) var recordsReadDF = spark.read.format("org.apache.hudi") - .load(basePath + "/*") + .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit("")).count() == 0) // Non-existent column writer = getDataFrameWriter(classOf[NonpartitionedKeyGenerator].getName) writer.partitionBy("abc") + .mode(SaveMode.Overwrite) .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") - .load(basePath + "/*") + .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit("")).count() == 0) } @@ -597,7 +613,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val recordsDF = spark.createDataFrame(rdd, schema1) recordsDF.write.format("org.apache.hudi") .options(opts) - .mode(SaveMode.Append) + .mode(SaveMode.Overwrite) .save(basePath) // 2. 
write records with schema2 add column age @@ -651,10 +667,10 @@ class TestCOWDataSource extends HoodieClientTestBase { val inputDF = spark.read.schema(schema1.toDDL).json(spark.sparkContext.parallelize(records, 2)) inputDF.write.format("org.apache.hudi") .options(opts) - .mode(SaveMode.Append) + .mode(SaveMode.Overwrite) .save(basePath) val recordsReadDF = spark.read.format("org.apache.hudi") - .load(basePath + "/*/*") + .load(basePath) val resultSchema = new StructType(recordsReadDF.schema.filter(p=> !p.name.startsWith("_hoodie")).toArray) assertEquals(resultSchema, schema1) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestDataSourceForBootstrap.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestDataSourceForBootstrap.scala index 3d35c03e48dbc..d6ae80d09af58 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestDataSourceForBootstrap.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestDataSourceForBootstrap.scala @@ -137,7 +137,8 @@ class TestDataSourceForBootstrap { verifyIncrementalViewResult(commitInstantTime1, commitInstantTime2, isPartitioned = false, isHiveStylePartitioned = false) } - @Test def testMetadataBootstrapCOWHiveStylePartitioned(): Unit = { + @Test + def testMetadataBootstrapCOWHiveStylePartitioned(): Unit = { val timestamp = Instant.now.toEpochMilli val jsc = JavaSparkContext.fromSparkContext(spark.sparkContext) @@ -153,7 +154,9 @@ class TestDataSourceForBootstrap { // Perform bootstrap val commitInstantTime1 = runMetadataBootstrapAndVerifyCommit( - DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, Some("datestr")) + DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, + Some("datestr"), + Map(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key -> "true")) // Read bootstrapped table and verify count val hoodieROViewDF1 = spark.read.format("hudi").load(basePath + "/*") @@ -472,11 +475,13 @@ class TestDataSourceForBootstrap { } def runMetadataBootstrapAndVerifyCommit(tableType: String, - partitionColumns: Option[String] = None): String = { + partitionColumns: Option[String] = None, + extraOpts: Map[String, String] = Map.empty): String = { val bootstrapDF = spark.emptyDataFrame bootstrapDF.write .format("hudi") .options(commonOpts) + .options(extraOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, tableType) .option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, partitionColumns.getOrElse("")) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala index ee914aec5432e..eba2a3d24c827 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala @@ -17,6 +17,8 @@ package org.apache.hudi.functional +import java.util.Properties + import org.apache.hadoop.fs.Path import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.config.HoodieMetadataConfig @@ -61,6 +63,7 @@ class TestMORDataSource extends HoodieClientTestBase { val updatedVerificationVal: String = "driver_update" @BeforeEach override def setUp() { + setTableName("hoodie_test") initPath() initSparkContexts() spark = sqlContext.sparkSession diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestTimeTravelQuery.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestTimeTravelQuery.scala index bb102a4cd912e..c4af71768b167 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestTimeTravelQuery.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestTimeTravelQuery.scala @@ -46,6 +46,7 @@ class TestTimeTravelQuery extends HoodieClientTestBase { ) @BeforeEach override def setUp() { + setTableName("hoodie_test") initPath() initSparkContexts() spark = sqlContext.sparkSession @@ -217,13 +218,13 @@ class TestTimeTravelQuery extends HoodieClientTestBase { } private def defaultDateTimeFormat(queryInstant: String): String = { - val date = HoodieActiveTimeline.COMMIT_FORMATTER.parse(queryInstant) + val date = HoodieActiveTimeline.parseInstantTime(queryInstant) val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") format.format(date) } private def defaultDateFormat(queryInstant: String): String = { - val date = HoodieActiveTimeline.COMMIT_FORMATTER.parse(queryInstant) + val date = HoodieActiveTimeline.parseInstantTime(queryInstant) val format = new SimpleDateFormat("yyyy-MM-dd") format.format(date) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala new file mode 100644 index 0000000000000..b2ada77c21941 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.keygen.{ComplexKeyGenerator, SimpleKeyGenerator} +import org.apache.spark.sql.SaveMode + +import scala.util.control.NonFatal + +class TestAlterTableDropPartition extends TestHoodieSqlBase { + + test("Drop non-partitioned table") { + val tableName = generateTableName + // create table + spark.sql( + s""" + | create table $tableName ( + | id bigint, + | name string, + | ts string, + | dt string + | ) + | using hudi + | options ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + |""".stripMargin) + // insert data + spark.sql(s"""insert into $tableName values (1, "z3", "v1", "2021-10-01"), (2, "l4", "v1", "2021-10-02")""") + + checkExceptionContain(s"alter table $tableName drop partition (dt='2021-10-01')")( + s"dt is not a valid partition column in table `default`.`$tableName`.") + } + + Seq(false, true).foreach { urlencode => + test(s"Drop single-partition table' partitions, urlencode: $urlencode") { + withTempDir { tmp => + val tableName = generateTableName + val tablePath = s"${tmp.getCanonicalPath}/$tableName" + + import spark.implicits._ + val df = Seq((1, "z3", "v1", "2021/10/01"), (2, "l4", "v1", "2021/10/02")) + .toDF("id", "name", "ts", "dt") + + df.write.format("hudi") + .option(HoodieWriteConfig.TBL_NAME.key, tableName) + .option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL) + .option(RECORDKEY_FIELD.key, "id") + .option(PRECOMBINE_FIELD.key, "ts") + .option(PARTITIONPATH_FIELD.key, "dt") + .option(URL_ENCODE_PARTITIONING.key(), urlencode) + .option(KEYGENERATOR_CLASS_NAME.key, classOf[SimpleKeyGenerator].getName) + .option(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "1") + .option(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "1") + .mode(SaveMode.Overwrite) + .save(tablePath) + + // register meta to spark catalog by creating table + spark.sql( + s""" + |create table $tableName using hudi + | options ( + | primaryKey = 'id', + | preCombineField = 'ts' + |) + |partitioned by (dt) + |location '$tablePath' + |""".stripMargin) + + // drop 2021-10-01 partition + spark.sql(s"alter table $tableName drop partition (dt='2021/10/01')") + + checkAnswer(s"select dt from $tableName") (Seq(s"2021/10/02")) + } + } + } + + test("Drop single-partition table' partitions created by sql") { + val tableName = generateTableName + // create table + spark.sql( + s""" + | create table $tableName ( + | id bigint, + | name string, + | ts string, + | dt string + | ) + | using hudi + | options ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + | partitioned by (dt) + |""".stripMargin) + // insert data + spark.sql(s"""insert into $tableName values (1, "z3", "v1", "2021-10-01"), (2, "l4", "v1", "2021-10-02")""") + + // specify duplicate partition columns + checkExceptionContain(s"alter table $tableName drop partition (dt='2021-10-01', dt='2021-10-02')")( + "Found duplicate keys 'dt'") + + // drop 2021-10-01 partition + spark.sql(s"alter table $tableName drop partition (dt='2021-10-01')") + + checkAnswer(s"select id, name, ts, dt from $tableName") (Seq(2, "l4", "v1", "2021-10-02")) + } + + Seq(false, true).foreach { hiveStyle => + test(s"Drop multi-level partitioned table's partitions, isHiveStylePartitioning: $hiveStyle") { + withTempDir { tmp => + val tableName = generateTableName + val tablePath = s"${tmp.getCanonicalPath}/$tableName" + + import spark.implicits._ + val df = Seq((1, "z3", "v1", "2021", "10", "01"), (2, "l4", "v1", 
"2021", "10","02")) + .toDF("id", "name", "ts", "year", "month", "day") + + df.write.format("hudi") + .option(HoodieWriteConfig.TBL_NAME.key, tableName) + .option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL) + .option(RECORDKEY_FIELD.key, "id") + .option(PRECOMBINE_FIELD.key, "ts") + .option(PARTITIONPATH_FIELD.key, "year,month,day") + .option(HIVE_STYLE_PARTITIONING.key, hiveStyle) + .option(KEYGENERATOR_CLASS_NAME.key, classOf[ComplexKeyGenerator].getName) + .option(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "1") + .option(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "1") + .mode(SaveMode.Overwrite) + .save(tablePath) + + // register meta to spark catalog by creating table + spark.sql( + s""" + |create table $tableName using hudi + | options ( + | primaryKey = 'id', + | preCombineField = 'ts' + |) + |partitioned by (year, month, day) + |location '$tablePath' + |""".stripMargin) + + // not specified all partition column + checkExceptionContain(s"alter table $tableName drop partition (year='2021', month='10')")( + "All partition columns need to be specified for Hoodie's dropping partition" + ) + // drop 2021-10-01 partition + spark.sql(s"alter table $tableName drop partition (year='2021', month='10', day='01')") + + checkAnswer(s"select id, name, ts, year, month, day from $tableName")( + Seq(2, "l4", "v1", "2021", "10", "02") + ) + } + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieSqlBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieSqlBase.scala index e35b9b703f4e1..5413bf4044892 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieSqlBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieSqlBase.scala @@ -18,13 +18,15 @@ package org.apache.spark.sql.hudi import java.io.File - import org.apache.log4j.Level +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.util.Utils import org.scalactic.source import org.scalatest.{BeforeAndAfterAll, FunSuite, Tag} +import java.util.TimeZone + class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll { org.apache.log4j.Logger.getRootLogger.setLevel(Level.WARN) @@ -34,6 +36,7 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll { dir } + TimeZone.setDefault(DateTimeUtils.getTimeZone("CTT")) protected lazy val spark: SparkSession = SparkSession.builder() .master("local[1]") .appName("hoodie sql test") @@ -43,6 +46,7 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll { .config("hoodie.upsert.shuffle.parallelism", "4") .config("hoodie.delete.shuffle.parallelism", "4") .config("spark.sql.warehouse.dir", sparkWareHouse.getCanonicalPath) + .config("spark.sql.session.timeZone", "CTT") .getOrCreate() private var tableId = 0 @@ -77,7 +81,7 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll { } protected def checkAnswer(sql: String)(expects: Seq[Any]*): Unit = { - assertResult(expects.map(row => Row(row: _*)).toArray)(spark.sql(sql).collect()) + assertResult(expects.map(row => Row(row: _*)).toArray.sortBy(_.toString()))(spark.sql(sql).collect().sortBy(_.toString())) } protected def checkException(sql: String)(errorMsg: String): Unit = { @@ -92,6 +96,19 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll { assertResult(true)(hasException) } + protected def checkExceptionContain(sql: String)(errorMsg: String): Unit = { + var 
hasException = false + try { + spark.sql(sql) + } catch { + case e: Throwable => + assertResult(true)(e.getMessage.contains(errorMsg)) + hasException = true + } + assertResult(true)(hasException) + } + + protected def removeQuotes(value: Any): Any = { value match { case s: String => s.stripPrefix("'").stripSuffix("'") diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMereIntoLogOnlyTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoLogOnlyTable.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMereIntoLogOnlyTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoLogOnlyTable.scala index b4492b5911534..d911ace62a23e 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMereIntoLogOnlyTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoLogOnlyTable.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.hudi import org.apache.hudi.testutils.DataSourceTestUtils -class TestMereIntoLogOnlyTable extends TestHoodieSqlBase { +class TestMergeIntoLogOnlyTable extends TestHoodieSqlBase { test("Test Query Log Only MOR Table") { withTempDir { tmp => diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable2.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable2.scala index 92a2c63ee617f..bf73251e947d7 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable2.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable2.scala @@ -353,19 +353,7 @@ class TestMergeIntoTable2 extends TestHoodieSqlBase { |""".stripMargin if (HoodieSqlUtils.isSpark3) { - checkException(mergeSql)( - "\nColumns aliases are not allowed in MERGE.(line 5, pos 5)\n\n" + - "== SQL ==\n\r\n" + - s" merge into $tableName\r\n" + - " using (\r\n" + - " select 1, 'a1', 10, 1000, '1'\r\n" + - " ) s0(id,name,price,ts,flag)\r\n" + - "-----^^^\n" + - s" on s0.id = $tableName.id\r\n" + - " when matched and flag = '1' then update set\r\n" + - " id = s0.id, name = s0.name, price = s0.price, ts = s0.ts\r\n" + - " when not matched and flag = '1' then insert *\r\n" - ) + checkExceptionContain(mergeSql)("Columns aliases are not allowed in MERGE") } else { spark.sql(mergeSql) checkAnswer(s"select id, name, price, ts from $tableName")( @@ -375,4 +363,184 @@ class TestMergeIntoTable2 extends TestHoodieSqlBase { } } + test("Test MergeInto When PrimaryKey And PreCombineField Of Source Table And Target Table Differ In Case Only") { + withTempDir { tmp => + val tableName = generateTableName + // Create table + spark.sql( + s""" + |create table $tableName ( + | id int, + | name string, + | price double, + | ts long + |) using hudi + | location '${tmp.getCanonicalPath}/$tableName' + | options ( + | primaryKey ='id', + | preCombineField = 'ts' + | ) + """.stripMargin) + + spark.sql( + s""" + | merge into $tableName + | using ( + | select 1 as ID, 'a1' as NAME, 10 as PRICE, 1000 as TS, '1' as FLAG + | ) s0 + | on s0.ID = $tableName.id + | when matched and FLAG = '1' then update set + | id = s0.ID, name = s0.NAME, price = s0.PRICE, ts = s0.TS + | when not matched and FLAG = '1' then insert * + |""".stripMargin) + checkAnswer(s"select id, name, price, ts from $tableName")( + Seq(1, 
"a1", 10.0, 1000) + ) + + // Test the case of the column names of condition and action is different from that of source table + spark.sql( + s""" + | merge into $tableName + | using ( + | select 1 as ID, 'a1' as NAME, 11 as PRICE, 1001 as TS, '1' as FLAG + | ) s0 + | on s0.id = $tableName.id + | when matched and FLAG = '1' then update set + | id = s0.id, name = s0.NAME, price = s0.PRICE, ts = s0.ts + | when not matched and FLAG = '1' then insert * + |""".stripMargin) + checkAnswer(s"select id, name, price, ts from $tableName")( + Seq(1, "a1", 11.0, 1001) + ) + + // Test the case of the column names of cast condition is different from that of source table + spark.sql( + s""" + | merge into $tableName + | using ( + | select 2 as ID, 'a2' as NAME, 12 as PRICE, 1002 as TS, '1' as FLAG + | ) s0 + | on cast(s0.id as int) = $tableName.id + | when matched and FLAG = '1' then update set + | id = s0.id, name = s0.NAME, price = s0.PRICE, ts = s0.ts + | when not matched and FLAG = '1' then insert * + |""".stripMargin) + checkAnswer(s"select id, name, price, ts from $tableName")( + Seq(1, "a1", 11.0, 1001), + Seq(2, "a2", 12.0, 1002) + ) + } + } + + test("Test ignoring case") { + withTempDir { tmp => + val tableName = generateTableName + // Create table + spark.sql( + s""" + |create table $tableName ( + | ID int, + | name string, + | price double, + | TS long, + | DT string + |) using hudi + | location '${tmp.getCanonicalPath}/$tableName' + | options ( + | primaryKey ='ID', + | preCombineField = 'TS' + | ) + """.stripMargin) + + // First merge with a extra input field 'flag' (insert a new record) + spark.sql( + s""" + | merge into $tableName + | using ( + | select 1 as id, 'a1' as name, 10 as PRICE, 1000 as ts, '2021-05-05' as dt, '1' as flag + | ) s0 + | on s0.id = $tableName.id + | when matched and flag = '1' then update set + | id = s0.id, name = s0.name, PRICE = s0.price, ts = s0.ts, dt = s0.dt + | when not matched and flag = '1' then insert * + """.stripMargin) + checkAnswer(s"select id, name, price, ts, dt from $tableName")( + Seq(1, "a1", 10.0, 1000, "2021-05-05") + ) + + // Second merge (update the record) + spark.sql( + s""" + | merge into $tableName + | using ( + | select 1 as id, 'a1' as name, 20 as PRICE, '2021-05-05' as dt, 1001 as ts + | ) s0 + | on s0.id = $tableName.id + | when matched then update set + | id = s0.id, name = s0.name, PRICE = s0.price, ts = s0.ts, dt = s0.dt + | when not matched then insert * + """.stripMargin) + checkAnswer(s"select id, name, price, ts, dt from $tableName")( + Seq(1, "a1", 20.0, 1001, "2021-05-05") + ) + + // Test ignoring case when column name matches + spark.sql( + s""" + | merge into $tableName as t0 + | using ( + | select 1 as id, 'a1' as name, 1111 as ts, '2021-05-05' as dt, 111 as PRICE union all + | select 2 as id, 'a2' as name, 1112 as ts, '2021-05-05' as dt, 112 as PRICE + | ) as s0 + | on t0.id = s0.id + | when matched then update set * + | when not matched then insert * + |""".stripMargin) + checkAnswer(s"select id, name, price, ts, dt from $tableName")( + Seq(1, "a1", 111.0, 1111, "2021-05-05"), + Seq(2, "a2", 112.0, 1112, "2021-05-05") + ) + } + } + + test("Test ignoring case for MOR table") { + withTempDir { tmp => + val tableName = generateTableName + // Create a mor partitioned table. 
+ spark.sql( + s""" + | create table $tableName ( + | ID int, + | NAME string, + | price double, + | TS long, + | dt string + | ) using hudi + | options ( + | type = 'mor', + | primaryKey = 'ID', + | preCombineField = 'TS' + | ) + | partitioned by(dt) + | location '${tmp.getCanonicalPath}/$tableName' + """.stripMargin) + + // Test ignoring case when column name matches + spark.sql( + s""" + | merge into $tableName as t0 + | using ( + | select 1 as id, 'a1' as NAME, 1111 as ts, '2021-05-05' as DT, 111 as price + | ) as s0 + | on t0.id = s0.id + | when matched then update set * + | when not matched then insert * + """.stripMargin + ) + checkAnswer(s"select id, name, price, ts, dt from $tableName")( + Seq(1, "a1", 111.0, 1111, "2021-05-05") + ) + } + } + } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartialUpdateForMergeInto.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartialUpdateForMergeInto.scala index 0dbb07466d4af..357954ebb1d57 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartialUpdateForMergeInto.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartialUpdateForMergeInto.scala @@ -98,7 +98,7 @@ class TestPartialUpdateForMergeInto extends TestHoodieSqlBase { | preCombineField = '_ts' |)""".stripMargin) - checkException( + checkExceptionContain( s""" |merge into $tableName t0 |using ( select 1 as id, 'a1' as name, 12 as price) s0 @@ -106,7 +106,7 @@ class TestPartialUpdateForMergeInto extends TestHoodieSqlBase { |when matched then update set price = s0.price """.stripMargin)( "Missing specify value for the preCombineField: _ts in merge-into update action. " + - "You should add '... update set _ts = xx....' to the when-matched clause.;") + "You should add '... update set _ts = xx....' to the when-matched clause.") val tableName2 = generateTableName spark.sql( @@ -123,7 +123,7 @@ class TestPartialUpdateForMergeInto extends TestHoodieSqlBase { | preCombineField = '_ts' |)""".stripMargin) - checkException( + checkExceptionContain( s""" |merge into $tableName2 t0 |using ( select 1 as id, 'a1' as name, 12 as price, 1000 as ts) s0 @@ -132,6 +132,6 @@ class TestPartialUpdateForMergeInto extends TestHoodieSqlBase { """.stripMargin)( "Missing specify the value for target field: 'id' in merge into update action for MOR table. " + "Currently we cannot support partial update for MOR, please complete all the target fields " + - "just like '...update set id = s0.id, name = s0.name ....';") + "just like '...update set id = s0.id, name = s0.name ....'") } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestShowPartitions.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestShowPartitions.scala new file mode 100644 index 0000000000000..05ee61c4879fd --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestShowPartitions.scala @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +import org.apache.spark.sql.Row + +class TestShowPartitions extends TestHoodieSqlBase { + + test("Test Show Non Partitioned Table's Partitions") { + val tableName = generateTableName + // Create a non-partitioned table + spark.sql( + s""" + | create table $tableName ( + | id int, + | name string, + | price double, + | ts long + |) using hudi + |options ( + | primaryKey = 'id', + | preCombineField = 'ts' + ) + """.stripMargin) + // Insert data + spark.sql( + s""" + | insert into $tableName + | select 1 as id, 'a1' as name, 10 as price, 1000 as ts + """.stripMargin) + checkAnswer(s"show partitions $tableName")(Seq.empty: _*) + } + + test("Test Show Partitioned Table's Partitions") { + val tableName = generateTableName + // Create a partitioned table + spark.sql( + s""" + | create table $tableName ( + | id int, + | name string, + | price double, + | ts long, + | dt string + ) using hudi + | partitioned by (dt) + | options ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + """.stripMargin) + // Empty partitions + checkAnswer(s"show partitions $tableName")(Seq.empty: _*) + + // Insert into dynamic partition + spark.sql( + s""" + | insert into $tableName + | values (1, 'a1', 10, 1000, '2021-01-01') + """.stripMargin) + checkAnswer(s"show partitions $tableName")(Seq("dt=2021-01-01")) + + // Insert into static partition + spark.sql( + s""" + | insert into $tableName partition(dt = '2021-01-02') + | select 2 as id, 'a2' as name, 10 as price, 1000 as ts + """.stripMargin) + checkAnswer(s"show partitions $tableName partition(dt='2021-01-02')")(Seq("dt=2021-01-02")) + + // Insert into null partition + spark.sql( + s""" + | insert into $tableName + | select 3 as id, 'a3' as name, 10 as price, 1000 as ts, null as dt + """.stripMargin) + checkAnswer(s"show partitions $tableName")( + Seq("dt=2021-01-01"), Seq("dt=2021-01-02"), Seq("dt=default") + ) + } + + test("Test Show Table's Partitions with MultiLevel Partitions") { + val tableName = generateTableName + // Create a multi-level partitioned table + spark.sql( + s""" + | create table $tableName ( + | id int, + | name string, + | price double, + | ts long, + | year string, + | month string, + | day string + | ) using hudi + | partitioned by (year, month, day) + | options ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + """.stripMargin) + // Empty partitions + checkAnswer(s"show partitions $tableName")(Seq.empty: _*) + + // Insert into dynamic partition + spark.sql( + s""" + | insert into $tableName + | values + | (1, 'a1', 10, 1000, '2021', '01', '01'), + | (2, 'a2', 10, 1000, '2021', '01', '02'), + | (3, 'a3', 10, 1000, '2021', '02', '01'), + | (4, 'a4', 10, 1000, '2021', '02', null), + | (5, 'a5', 10, 1000, '2021', null, '01'), + | (6, 'a6', 10, 1000, null, '01', '02'), + | (7, 'a6', 10, 1000, '2022', null, null), + | (8, 'a6', 10, 1000, null, '01', null), + | (9, 'a6', 10, 1000, null, null, '01') + """.stripMargin) + + // check all partitions + checkAnswer(s"show partitions $tableName")( + Seq("year=2021/month=01/day=01"), + Seq("year=2021/month=01/day=02"), + 
Seq("year=2021/month=02/day=01"), + Seq("year=2021/month=02/day=default"), + Seq("year=2021/month=default/day=01"), + Seq("year=default/month=01/day=default"), + Seq("year=default/month=01/day=02"), + Seq("year=default/month=default/day=01"), + Seq("year=2022/month=default/day=default") + ) + + // check partial partitions + checkAnswer(s"show partitions $tableName partition(year='2021', month='01', day='01')")( + Seq("year=2021/month=01/day=01") + ) + checkAnswer(s"show partitions $tableName partition(year='2021', month='02')")( + Seq("year=2021/month=02/day=default"), + Seq("year=2021/month=02/day=01") + ) + checkAnswer(s"show partitions $tableName partition(day=01)")( + Seq("year=2021/month=02/day=01"), + Seq("year=2021/month=default/day=01"), + Seq("year=2021/month=01/day=01"), + Seq("year=default/month=default/day=01") + ) + } +} diff --git a/hudi-spark-datasource/hudi-spark2/src/main/java/org/apache/hudi/internal/DefaultSource.java b/hudi-spark-datasource/hudi-spark2/src/main/java/org/apache/hudi/internal/DefaultSource.java index 649440639e653..addbc899d7b55 100644 --- a/hudi-spark-datasource/hudi-spark2/src/main/java/org/apache/hudi/internal/DefaultSource.java +++ b/hudi-spark-datasource/hudi-spark2/src/main/java/org/apache/hudi/internal/DefaultSource.java @@ -64,7 +64,7 @@ public Optional createWriter(String writeUUID, StructType sche String tblName = options.get(HoodieWriteConfig.TBL_NAME.key()).get(); boolean populateMetaFields = options.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS.key(), Boolean.parseBoolean(HoodieTableConfig.POPULATE_META_FIELDS.defaultValue())); - // 1st arg to createHooodieConfig is not really reuqired to be set. but passing it anyways. + // 1st arg to createHoodieConfig is not really required to be set. but passing it anyways. 
HoodieWriteConfig config = DataSourceUtils.createHoodieConfig(options.get(HoodieWriteConfig.AVRO_SCHEMA_STRING.key()).get(), path, tblName, options.asMap()); boolean arePartitionRecordsSorted = HoodieInternalConfig.getBulkInsertIsPartitionRecordsSorted( options.get(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED).isPresent() diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala index 9a3e8e3024311..d47e7fbb497b0 100644 --- a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala +++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala @@ -82,4 +82,8 @@ class Spark2Adapter extends SparkAdapter { override def createLike(left: Expression, right: Expression): Expression = { Like(left, right) } + + override def parseMultipartIdentifier(parser: ParserInterface, sqlText: String): Seq[String] = { + throw new IllegalStateException(s"Should not call ParserInterface#parseMultipartIdentifier for spark2") + } } diff --git a/hudi-spark-datasource/hudi-spark3/src/main/java/org/apache/hudi/spark3/internal/DefaultSource.java b/hudi-spark-datasource/hudi-spark3/src/main/java/org/apache/hudi/spark3/internal/DefaultSource.java index 1161088099bb8..eda8faead986f 100644 --- a/hudi-spark-datasource/hudi-spark3/src/main/java/org/apache/hudi/spark3/internal/DefaultSource.java +++ b/hudi-spark-datasource/hudi-spark3/src/main/java/org/apache/hudi/spark3/internal/DefaultSource.java @@ -53,7 +53,7 @@ public Table getTable(StructType schema, Transform[] partitioning, Map> partition, Seq userSpecifiedCols, + LogicalPlan query, boolean overwrite, boolean ifPartitionNotExists) { + try { + if (isSpark30) { + Constructor constructor = InsertIntoStatement.class.getConstructor( + LogicalPlan.class, Map.class, LogicalPlan.class, boolean.class, boolean.class); + return constructor.newInstance(table, partition, query, overwrite, ifPartitionNotExists); + } else { + Constructor constructor = InsertIntoStatement.class.getConstructor( + LogicalPlan.class, Map.class, Seq.class, LogicalPlan.class, boolean.class, boolean.class); + return constructor.newInstance(table, partition, userSpecifiedCols, query, overwrite, ifPartitionNotExists); + } + } catch (Exception e) { + throw new RuntimeException("Error in create InsertIntoStatement", e); + } + } +} diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/adapter/Spark3Adapter.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/adapter/Spark3Adapter.scala index 4c9a06b3cf209..87d80d0b42bf0 100644 --- a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/adapter/Spark3Adapter.scala +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/adapter/Spark3Adapter.scala @@ -19,10 +19,13 @@ package org.apache.spark.sql.adapter import org.apache.hudi.Spark3RowSerDe import org.apache.hudi.client.utils.SparkRowSerDe +import org.apache.hudi.spark3.internal.ReflectUtil +import org.apache.spark.SPARK_VERSION import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, Like} +import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.JoinType import 
org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, Join, JoinHint, LogicalPlan} import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} @@ -67,15 +70,16 @@ class Spark3Adapter extends SparkAdapter { override def getInsertIntoChildren(plan: LogicalPlan): Option[(LogicalPlan, Map[String, Option[String]], LogicalPlan, Boolean, Boolean)] = { plan match { - case InsertIntoStatement(table, partitionSpec, query, overwrite, ifPartitionNotExists) => - Some((table, partitionSpec, query, overwrite, ifPartitionNotExists)) - case _=> None + case insert: InsertIntoStatement => + Some((insert.table, insert.partitionSpec, insert.query, insert.overwrite, insert.ifPartitionNotExists)) + case _ => + None } } override def createInsertInto(table: LogicalPlan, partition: Map[String, Option[String]], query: LogicalPlan, overwrite: Boolean, ifPartitionNotExists: Boolean): LogicalPlan = { - InsertIntoStatement(table, partition, query, overwrite, ifPartitionNotExists) + ReflectUtil.createInsertInto(SPARK_VERSION.startsWith("3.0"), table, partition, Seq.empty[String], query, overwrite, ifPartitionNotExists) } override def createSparkParsePartitionUtil(conf: SQLConf): SparkParsePartitionUtil = { @@ -85,4 +89,8 @@ class Spark3Adapter extends SparkAdapter { override def createLike(left: Expression, right: Expression): Expression = { new Like(left, right) } + + override def parseMultipartIdentifier(parser: ParserInterface, sqlText: String): Seq[String] = { + parser.parseMultipartIdentifier(sqlText) + } } diff --git a/hudi-spark-datasource/hudi-spark3/src/test/java/org/apache/hudi/spark3/internal/TestReflectUtil.java b/hudi-spark-datasource/hudi-spark3/src/test/java/org/apache/hudi/spark3/internal/TestReflectUtil.java new file mode 100644 index 0000000000000..284b2aaf1f81d --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3/src/test/java/org/apache/hudi/spark3/internal/TestReflectUtil.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.spark3.internal; + +import org.apache.hudi.testutils.HoodieClientTestBase; + +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation; +import org.apache.spark.sql.catalyst.plans.logical.InsertIntoStatement; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Unit tests {@link ReflectUtil}. 
+ */ +public class TestReflectUtil extends HoodieClientTestBase { + + @Test + public void testDataSourceWriterExtraCommitMetadata() throws Exception { + SparkSession spark = sqlContext.sparkSession(); + + String insertIntoSql = "insert into test_reflect_util values (1, 'z3', 1, '2021')"; + InsertIntoStatement statement = (InsertIntoStatement) spark.sessionState().sqlParser().parsePlan(insertIntoSql); + + InsertIntoStatement newStatment = ReflectUtil.createInsertInto( + spark.version().startsWith("3.0"), + statement.table(), + statement.partitionSpec(), + scala.collection.immutable.List.empty(), + statement.query(), + statement.overwrite(), + statement.ifPartitionNotExists()); + + Assertions.assertTrue( + ((UnresolvedRelation)newStatment.table()).multipartIdentifier().contains("test_reflect_util")); + + if (!spark.version().startsWith("3.0")) { + Assertions.assertTrue(newStatment.userSpecifiedCols().isEmpty()); + } + } +} diff --git a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncConfig.java b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncConfig.java index 06408bf61f06e..d4d580fe276af 100644 --- a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncConfig.java +++ b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncConfig.java @@ -73,9 +73,6 @@ public class DLASyncConfig implements Serializable { @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata") public Boolean useFileListingFromMetadata = HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; - @Parameter(names = {"--verify-metadata-file-listing"}, description = "Verify file listing from Hudi's metadata against file system") - public Boolean verifyMetadataFileListing = HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(); - @Parameter(names = {"--help", "-h"}, help = true) public Boolean help = false; @@ -97,7 +94,6 @@ public static DLASyncConfig copy(DLASyncConfig cfg) { newConfig.skipRTSync = cfg.skipRTSync; newConfig.useDLASyncHiveStylePartitioning = cfg.useDLASyncHiveStylePartitioning; newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata; - newConfig.verifyMetadataFileListing = cfg.verifyMetadataFileListing; newConfig.supportTimestamp = cfg.supportTimestamp; return newConfig; } @@ -110,7 +106,6 @@ public String toString() { + partitionValueExtractorClass + '\'' + ", assumeDatePartitioning=" + assumeDatePartitioning + ", useDLASyncHiveStylePartitioning=" + useDLASyncHiveStylePartitioning + ", useFileListingFromMetadata=" + useFileListingFromMetadata - + ", verifyMetadataFileListing=" + verifyMetadataFileListing + ", help=" + help + '}'; } } diff --git a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java index 786c7208513ba..bf0369ae2ee58 100644 --- a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java +++ b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java @@ -19,18 +19,17 @@ package org.apache.hudi.dla; import com.beust.jcommander.JCommander; -import java.util.HashMap; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.util.Option; import 
org.apache.hudi.dla.util.Utils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.InvalidTableException; -import org.apache.hudi.hadoop.HoodieParquetInputFormat; -import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.hive.SchemaDifference; import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hudi.sync.common.AbstractSyncHoodieClient; @@ -39,6 +38,7 @@ import org.apache.log4j.Logger; import org.apache.parquet.schema.MessageType; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; @@ -147,18 +147,14 @@ private void syncSchema(String tableName, boolean tableExists, boolean useRealTi // Check and sync schema if (!tableExists) { LOG.info("DLA table " + tableName + " is not found. Creating it"); - if (!useRealTimeInputFormat) { - String inputFormatClassName = HoodieParquetInputFormat.class.getName(); - hoodieDLAClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(), - ParquetHiveSerDe.class.getName(), new HashMap<>(), new HashMap<>()); - } else { - // Custom serde will not work with ALTER TABLE REPLACE COLUMNS - // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive - // /ql/exec/DDLTask.java#L3488 - String inputFormatClassName = HoodieParquetRealtimeInputFormat.class.getName(); - hoodieDLAClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(), - ParquetHiveSerDe.class.getName(), new HashMap<>(), new HashMap<>()); - } + + String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(HoodieFileFormat.PARQUET, useRealTimeInputFormat); + + // Custom serde will not work with ALTER TABLE REPLACE COLUMNS + // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive + // /ql/exec/DDLTask.java#L3488 + hoodieDLAClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(), + ParquetHiveSerDe.class.getName(), new HashMap<>(), new HashMap<>()); } else { // Check if the table schema has evolved Map tableSchema = hoodieDLAClient.getTableSchema(tableName); @@ -173,7 +169,7 @@ private void syncSchema(String tableName, boolean tableExists, boolean useRealTi } /** - * Syncs the list of storage parititions passed in (checks if the partition is in dla, if not adds it or if the + * Syncs the list of storage partitions passed in (checks if the partition is in dla, if not adds it or if the * partition path does not match, it updates the partition path). 
*/ private void syncPartitions(String tableName, List writtenPartitionsSince) { diff --git a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java index 6af01194539e9..20f94f01ef0b3 100644 --- a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java +++ b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java @@ -71,7 +71,7 @@ public class HoodieDLAClient extends AbstractSyncHoodieClient { public HoodieDLAClient(DLASyncConfig syncConfig, FileSystem fs) { super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata, - syncConfig.verifyMetadataFileListing, false, fs); + false, fs); this.dlaConfig = syncConfig; try { this.partitionValueExtractor = diff --git a/hudi-sync/hudi-hive-sync/pom.xml b/hudi-sync/hudi-hive-sync/pom.xml index 8f4fedaed5e0f..23ba1f96971ea 100644 --- a/hudi-sync/hudi-hive-sync/pom.xml +++ b/hudi-sync/hudi-hive-sync/pom.xml @@ -90,11 +90,13 @@ org.apache.hadoop hadoop-common tests + test org.apache.hadoop hadoop-hdfs tests + test @@ -151,6 +153,12 @@ test + + org.apache.spark + spark-core_${scala.binary.version} + test + + org.eclipse.jetty.aggregate diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveMetastoreBasedLockProvider.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveMetastoreBasedLockProvider.java index a1279b2be898c..f5bca97c3d4ac 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveMetastoreBasedLockProvider.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveMetastoreBasedLockProvider.java @@ -149,7 +149,7 @@ public void close() { if (lock != null) { hiveClient.unlock(lock.getLockid()); } - hiveClient.close(); + Hive.closeCurrent(); } catch (Exception e) { LOG.error(generateLogStatement(org.apache.hudi.common.lock.LockState.FAILED_TO_RELEASE, generateLogSuffixString())); } diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java index 30d5a401b7a8b..0c2abdbf571ac 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java @@ -40,13 +40,13 @@ public class HiveSyncConfig implements Serializable { @Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)") public String baseFileFormat = "PARQUET"; - @Parameter(names = {"--user"}, description = "Hive username", required = true) + @Parameter(names = {"--user"}, description = "Hive username") public String hiveUser; - @Parameter(names = {"--pass"}, description = "Hive password", required = true) + @Parameter(names = {"--pass"}, description = "Hive password") public String hivePass; - @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url", required = true) + @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url") public String jdbcUrl; @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true) @@ -89,9 +89,6 @@ public class HiveSyncConfig implements Serializable { @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata") public Boolean useFileListingFromMetadata = HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; - 
@Parameter(names = {"--verify-metadata-file-listing"}, description = "Verify file listing from Hudi's metadata against file system") - public Boolean verifyMetadataFileListing = HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(); - @Parameter(names = {"--table-properties"}, description = "Table properties to hive table") public String tableProperties; @@ -137,7 +134,6 @@ public static HiveSyncConfig copy(HiveSyncConfig cfg) { newConfig.tableName = cfg.tableName; newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat; newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata; - newConfig.verifyMetadataFileListing = cfg.verifyMetadataFileListing; newConfig.supportTimestamp = cfg.supportTimestamp; newConfig.decodePartition = cfg.decodePartition; newConfig.tableProperties = cfg.tableProperties; @@ -169,7 +165,6 @@ public String toString() { + ", ignoreExceptions=" + ignoreExceptions + ", skipROSuffix=" + skipROSuffix + ", useFileListingFromMetadata=" + useFileListingFromMetadata - + ", verifyMetadataFileListing=" + verifyMetadataFileListing + ", tableProperties='" + tableProperties + '\'' + ", serdeProperties='" + serdeProperties + '\'' + ", help=" + help diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index 4ffb52eaa211e..6a1d930c5e89e 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -321,7 +321,7 @@ private Map getSparkSerdeProperties(boolean readAsOptimized) { } /** - * Syncs the list of storage parititions passed in (checks if the partition is in hive, if not adds it or if the + * Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the * partition path does not match, it updates the partition path). */ private void syncPartitions(String tableName, List writtenPartitionsSince) { diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java index 13e48f5126479..265ab750d5aee 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java @@ -62,10 +62,10 @@ public class HoodieHiveClient extends AbstractSyncHoodieClient { private final HiveSyncConfig syncConfig; public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { - super(cfg.basePath, cfg.assumeDatePartitioning, cfg.useFileListingFromMetadata, cfg.verifyMetadataFileListing, cfg.withOperationField, fs); + super(cfg.basePath, cfg.assumeDatePartitioning, cfg.useFileListingFromMetadata, cfg.withOperationField, fs); this.syncConfig = cfg; - // Support JDBC, HiveQL and metastore based implementations for backwards compatiblity. Future users should + // Support JDBC, HiveQL and metastore based implementations for backwards compatibility. 
Future users should // disable jdbc and depend on metastore client for all hive registrations try { if (!StringUtils.isNullOrEmpty(cfg.syncMode)) { @@ -295,7 +295,7 @@ public void close() { try { ddlExecutor.close(); if (client != null) { - client.close(); + Hive.closeCurrent(); client = null; } } catch (Exception e) { diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java index b31c62ec5acb1..37aa54abd33b8 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java @@ -18,6 +18,7 @@ package org.apache.hudi.hive.ddl; +import org.apache.hadoop.security.UserGroupInformation; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.StorageSchemes; import org.apache.hudi.hive.HiveSyncConfig; @@ -101,6 +102,7 @@ public void createTable(String tableName, MessageType storageSchema, String inpu Table newTb = new Table(); newTb.setDbName(syncConfig.databaseName); newTb.setTableName(tableName); + newTb.setOwner(UserGroupInformation.getCurrentUser().getShortUserName()); newTb.setCreateTime((int) System.currentTimeMillis()); StorageDescriptor storageDescriptor = new StorageDescriptor(); storageDescriptor.setCols(fieldSchema); diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java index f28c3e6b63bb2..1603191c66947 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java @@ -33,6 +33,7 @@ import java.sql.Statement; import java.util.HashMap; import java.util.Map; +import java.util.Objects; /** * This class offers DDL executor backed by the jdbc This class preserves the old useJDBC = true way of doing things. 
@@ -44,6 +45,9 @@ public class JDBCExecutor extends QueryBasedDDLExecutor { public JDBCExecutor(HiveSyncConfig config, FileSystem fs) { super(config, fs); + Objects.requireNonNull(config.jdbcUrl, "--jdbc-url option is required for jdbc sync mode"); + Objects.requireNonNull(config.hiveUser, "--user option is required for jdbc sync mode"); + Objects.requireNonNull(config.hivePass, "--pass option is required for jdbc sync mode"); this.config = config; createHiveConnection(config.jdbcUrl, config.hiveUser, config.hivePass); } diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncConfig.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncConfig.java index fd7dbd81b1e3b..4bafd854ae318 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncConfig.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncConfig.java @@ -38,7 +38,6 @@ public static GlobalHiveSyncConfig copy(GlobalHiveSyncConfig cfg) { newConfig.tableName = cfg.tableName; newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat; newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata; - newConfig.verifyMetadataFileListing = cfg.verifyMetadataFileListing; newConfig.supportTimestamp = cfg.supportTimestamp; newConfig.decodePartition = cfg.decodePartition; newConfig.batchSyncNum = cfg.batchSyncNum; diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java index fcb626eb32310..64043a5bb29f8 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java @@ -850,7 +850,7 @@ private void verifyOldParquetFileTest(HoodieHiveClient hiveClient, String emptyC "Hive Schema should match the table schema + partition field"); assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),"Table partitions should match the number of partitions we wrote"); assertEquals(emptyCommitTime, - hiveClient.getLastCommitTimeSynced(HiveTestUtil.hiveSyncConfig.tableName).get(),"The last commit that was sycned should be updated in the TBLPROPERTIES"); + hiveClient.getLastCommitTimeSynced(HiveTestUtil.hiveSyncConfig.tableName).get(),"The last commit that was synced should be updated in the TBLPROPERTIES"); // make sure correct schema is picked Schema schema = SchemaTestUtil.getSimpleSchema(); diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestParquet2SparkSchemaUtils.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestParquet2SparkSchemaUtils.java index e4debe181c70e..3ca31b04395a1 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestParquet2SparkSchemaUtils.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestParquet2SparkSchemaUtils.java @@ -36,7 +36,19 @@ public class TestParquet2SparkSchemaUtils { private final SparkToParquetSchemaConverter spark2ParquetConverter = new SparkToParquetSchemaConverter(new SQLConf()); - private final SparkSqlParser parser = new SparkSqlParser(new SQLConf()); + private final SparkSqlParser parser = createSqlParser(); + + private static SparkSqlParser createSqlParser() { + try { + return SparkSqlParser.class.getDeclaredConstructor(SQLConf.class).newInstance(new SQLConf()); + } catch 
(Exception ne) { + try { // For spark 3.1, there is no constructor with SQLConf, use the default constructor + return SparkSqlParser.class.getDeclaredConstructor().newInstance(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } @Test public void testConvertPrimitiveType() { diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/TestCluster.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/TestCluster.java index a5631d0442798..c059c63a6a6f9 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/TestCluster.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/TestCluster.java @@ -47,6 +47,7 @@ import org.apache.hadoop.hive.metastore.RetryingMetaStoreClient; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hive.service.server.HiveServer2; import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.hadoop.ParquetWriter; @@ -265,7 +266,7 @@ public void startHiveServer2() { public void shutDown() { stopHiveServer2(); - client.close(); + Hive.closeCurrent(); hiveTestService.getHiveMetaStore().stop(); hdfsTestService.stop(); } diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java index 11ff74528dbe1..ce4720ac00907 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java @@ -54,17 +54,21 @@ public abstract class AbstractSyncHoodieClient { private final String basePath; private final boolean assumeDatePartitioning; private final boolean useFileListingFromMetadata; - private final boolean verifyMetadataFileListing; private final boolean withOperationField; + @Deprecated public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata, boolean verifyMetadataFileListing, boolean withOperationField, FileSystem fs) { + this(basePath, assumeDatePartitioning, useFileListingFromMetadata, withOperationField, fs); + } + + public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata, + boolean withOperationField, FileSystem fs) { this.metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); this.tableType = metaClient.getTableType(); this.basePath = basePath; this.assumeDatePartitioning = assumeDatePartitioning; this.useFileListingFromMetadata = useFileListingFromMetadata; - this.verifyMetadataFileListing = verifyMetadataFileListing; this.withOperationField = withOperationField; this.fs = fs; } @@ -76,7 +80,7 @@ public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, * @param inputFormatClass The input format class of this table. * @param outputFormatClass The output format class of this table. * @param serdeClass The serde class of this table. - * @param serdeProperties The serde properites of this table. + * @param serdeProperties The serde properties of this table. * @param tableProperties The table properties for this table. 
*/ public abstract void createTable(String tableName, MessageType storageSchema, @@ -156,8 +160,7 @@ public List getPartitionsWrittenToSince(Option lastCommitTimeSyn if (!lastCommitTimeSynced.isPresent()) { LOG.info("Last commit time synced is not known, listing all partitions in " + basePath + ",FS :" + fs); HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); - return FSUtils.getAllPartitionPaths(engineContext, basePath, useFileListingFromMetadata, verifyMetadataFileListing, - assumeDatePartitioning); + return FSUtils.getAllPartitionPaths(engineContext, basePath, useFileListingFromMetadata, assumeDatePartitioning); } else { LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then"); return TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().getCommitsTimeline() diff --git a/hudi-timeline-service/pom.xml b/hudi-timeline-service/pom.xml index ec659964da276..011cf8d765892 100644 --- a/hudi-timeline-service/pom.xml +++ b/hudi-timeline-service/pom.xml @@ -143,6 +143,7 @@ org.apache.hadoop hadoop-common tests + test org.mortbay.jetty diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieClusteringJob.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieClusteringJob.java index 8f7489249038b..2bd9ab43a747f 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieClusteringJob.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieClusteringJob.java @@ -24,10 +24,11 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hudi.client.SparkRDDWriteClient; -import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.util.Option; @@ -36,13 +37,13 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.jetbrains.annotations.TestOnly; import java.io.Serializable; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; public class HoodieClusteringJob { @@ -189,9 +190,9 @@ private String getSchemaFromLatestInstant() throws Exception { private int doCluster(JavaSparkContext jsc) throws Exception { String schemaStr = getSchemaFromLatestInstant(); try (SparkRDDWriteClient client = UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) { - JavaRDD writeResponse = - client.cluster(cfg.clusteringInstantTime, true).getWriteStatuses(); - return UtilHelpers.handleErrors(jsc, cfg.clusteringInstantTime, writeResponse); + Option commitMetadata = client.cluster(cfg.clusteringInstantTime, true).getCommitMetadata(); + + return handleErrors(commitMetadata.get(), cfg.clusteringInstantTime); } } @@ -230,10 +231,22 @@ public int doScheduleAndCluster(JavaSparkContext jsc) throws Exception { LOG.info("The schedule instant time is " + instantTime.get()); LOG.info("Step 2: Do cluster"); - JavaRDD writeResponse = - (JavaRDD) client.cluster(instantTime.get(), true).getWriteStatuses(); - return UtilHelpers.handleErrors(jsc, 
instantTime.get(), writeResponse); + Option<HoodieCommitMetadata> metadata = client.cluster(instantTime.get(), true).getCommitMetadata(); + return handleErrors(metadata.get(), instantTime.get()); } } + private int handleErrors(HoodieCommitMetadata metadata, String instantTime) { + List<HoodieWriteStat> writeStats = metadata.getPartitionToWriteStats().entrySet().stream().flatMap(e -> + e.getValue().stream()).collect(Collectors.toList()); + long errorsCount = writeStats.stream().mapToLong(HoodieWriteStat::getTotalWriteErrors).sum(); + if (errorsCount == 0) { + LOG.info(String.format("Clustering completed for instant %s with no write errors.", instantTime)); + return 0; + } + + LOG.error(String.format("Clustering failed with %d write errors.", errorsCount)); + return -1; + } + } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotCopier.java index 860e0ade71552..43e58d531ed0f 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotCopier.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotCopier.java @@ -74,15 +74,11 @@ static class Config implements Serializable { @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata") public Boolean useFileListingFromMetadata = HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; - - @Parameter(names = {"--verify-metadata-file-listing"}, description = "Verify file listing from Hudi's metadata against file system") - public Boolean verifyMetadataFileListing = HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(); } public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, final boolean shouldAssumeDatePartitioning, - final boolean useFileListingFromMetadata, - final boolean verifyMetadataFileListing) throws IOException { + final boolean useFileListingFromMetadata) throws IOException { FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration()); final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration()); final HoodieTableMetaClient tableMetadata = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir).build(); @@ -100,7 +96,7 @@ public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDi LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommitTimestamp)); - List<String> partitions = FSUtils.getAllPartitionPaths(context, baseDir, useFileListingFromMetadata, verifyMetadataFileListing, shouldAssumeDatePartitioning); + List<String> partitions = FSUtils.getAllPartitionPaths(context, baseDir, useFileListingFromMetadata, shouldAssumeDatePartitioning); if (partitions.size() > 0) { LOG.info(String.format("The job needs to copy %d partitions.", partitions.size())); @@ -194,8 +190,7 @@ public static void main(String[] args) throws IOException { // Copy HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); - copier.snapshot(jsc, cfg.basePath, cfg.outputPath, cfg.shouldAssumeDatePartitioning, cfg.useFileListingFromMetadata, - cfg.verifyMetadataFileListing); + copier.snapshot(jsc, cfg.basePath, cfg.outputPath, cfg.shouldAssumeDatePartitioning, cfg.useFileListingFromMetadata); // Stop the job jsc.stop(); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotExporter.java index 85f3d2d189b50..c2cfa390d08e8 100644 ---
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotExporter.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotExporter.java @@ -153,7 +153,7 @@ private Option getLatestCommitTimestamp(FileSystem fs, Config cfg) { } private List getPartitions(HoodieEngineContext engineContext, Config cfg) { - return FSUtils.getAllPartitionPaths(engineContext, cfg.sourceBasePath, true, false, false); + return FSUtils.getAllPartitionPaths(engineContext, cfg.sourceBasePath, true, false); } private void createSuccessTag(FileSystem fs, Config cfg) throws IOException { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index 53d68c3232a7f..6632dce86d953 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -323,9 +323,7 @@ public Pair>> readFromSource( // Retrieve the previous round checkpoints, if any Option resumeCheckpointStr = Option.empty(); if (commitTimelineOpt.isPresent()) { - // TODO: now not support replace action HUDI-1500 - Option lastCommit = commitTimelineOpt.get() - .filter(instant -> !instant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)).lastInstant(); + Option lastCommit = commitTimelineOpt.get().lastInstant(); if (lastCommit.isPresent()) { HoodieCommitMetadata commitMetadata = HoodieCommitMetadata .fromBytes(commitTimelineOpt.get().getInstantDetails(lastCommit.get()).get(), HoodieCommitMetadata.class); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java index a7bf353536bc8..7e49d9b88f69f 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java @@ -128,8 +128,8 @@ private void populateTableExecutionContextList(TypedProperties properties, Strin Helpers.deepCopyConfigs(config, cfg); String overriddenTargetBasePath = tableProperties.getString(Constants.TARGET_BASE_PATH_PROP, ""); cfg.targetBasePath = StringUtils.isNullOrEmpty(overriddenTargetBasePath) ? 
targetBasePath : overriddenTargetBasePath; - if (cfg.enableHiveSync && StringUtils.isNullOrEmpty(tableProperties.getString(DataSourceWriteOptions.HIVE_TABLE().key(), ""))) { - throw new HoodieException("Hive sync table field not provided!"); + if (cfg.enableMetaSync && StringUtils.isNullOrEmpty(tableProperties.getString(DataSourceWriteOptions.HIVE_TABLE().key(), ""))) { + throw new HoodieException("Meta sync table field not provided!"); } populateSchemaProviderProps(cfg, tableProperties); executionContext = new TableExecutionContext(); @@ -180,6 +180,7 @@ static String getTableWithDatabase(TableExecutionContext context) { static void deepCopyConfigs(Config globalConfig, HoodieDeltaStreamer.Config tableConfig) { tableConfig.enableHiveSync = globalConfig.enableHiveSync; + tableConfig.enableMetaSync = globalConfig.enableMetaSync; tableConfig.schemaProviderClassName = globalConfig.schemaProviderClassName; tableConfig.sourceOrderingField = globalConfig.sourceOrderingField; tableConfig.sourceClassName = globalConfig.sourceClassName; @@ -207,6 +208,11 @@ static void deepCopyConfigs(Config globalConfig, HoodieDeltaStreamer.Config tabl public static void main(String[] args) throws IOException { final Config config = new Config(); JCommander cmd = new JCommander(config, null, args); + + if (config.enableHiveSync) { + logger.warn("--enable-hive-sync will be deprecated in a future release; please use --enable-sync instead for Hive syncing"); + } + if (config.help || args.length == 0) { cmd.usage(); @@ -292,6 +298,9 @@ public static class Config implements Serializable { @Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive") public Boolean enableHiveSync = false; + @Parameter(names = {"--enable-sync"}, description = "Enable syncing meta") + public Boolean enableMetaSync = false; + @Parameter(names = {"--max-pending-compactions"}, description = "Maximum number of outstanding inflight/requested compactions.
Delta Sync will not happen unless" + "outstanding compactions is less than this number") diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java index 53b2febccaa82..ac15897f5785c 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java @@ -95,8 +95,7 @@ private void setHostAddrFromSparkConf(SparkConf sparkConf) { public void run() throws IOException { JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); - List allPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, cfg.basePath, - cfg.useFileListingFromMetadata, cfg.verifyMetadataFileListing, true); + List allPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, cfg.basePath, cfg.useFileListingFromMetadata, true); Collections.shuffle(allPartitionPaths); List selected = allPartitionPaths.stream().filter(p -> !p.contains("error")).limit(cfg.maxPartitions) .collect(Collectors.toList()); @@ -308,9 +307,6 @@ public static class Config implements Serializable { @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata") public Boolean useFileListingFromMetadata = HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; - @Parameter(names = {"--verify-metadata-file-listing"}, description = "Verify file listing from Hudi's metadata against file system") - public Boolean verifyMetadataFileListing = HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue(); - @Parameter(names = {"--help", "-h"}) public Boolean help = false; diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java index cf9e905bc939b..39340d0982596 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java @@ -69,7 +69,11 @@ protected InputBatch> fetchNewData(Option lastCheckpoint private JavaRDD toRDD(OffsetRange[] offsetRanges) { return KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, - LocationStrategies.PreferConsistent()).map(x -> (String) x.value()); + LocationStrategies.PreferConsistent()).filter(x -> { + String msgValue = (String) x.value(); + //Filter null messages from Kafka to prevent Exceptions + return msgValue != null; + }).map(x -> (String) x.value()); } @Override diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/ORCDFSSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/ORCDFSSource.java new file mode 100644 index 0000000000000..942bae89eec42 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/ORCDFSSource.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.sources; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.hudi.utilities.sources.helpers.DFSPathSelector; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * DFS Source that reads ORC data. + */ +public class ORCDFSSource extends RowSource { + + private final DFSPathSelector pathSelector; + + public ORCDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + this.pathSelector = DFSPathSelector.createSourceSelector(props, this.sparkContext.hadoopConfiguration()); + } + + @Override + public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) { + Pair<Option<String>, String> selectPathsWithMaxModificationTime = + pathSelector.getNextFilePathsAndMaxModificationTime(sparkContext, lastCkptStr, sourceLimit); + return selectPathsWithMaxModificationTime.getLeft() + .map(pathStr -> Pair.of(Option.of(fromFiles(pathStr)), selectPathsWithMaxModificationTime.getRight())) + .orElseGet(() -> Pair.of(Option.empty(), selectPathsWithMaxModificationTime.getRight())); + } + + private Dataset<Row> fromFiles(String pathStr) { + return sparkSession.read().orc(pathStr.split(",")); + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java similarity index 85% rename from hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerBase.java rename to hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java index 5a1cfc3327e82..043b0a4e0ad1d 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java @@ -26,9 +26,8 @@ import org.apache.hudi.utilities.testutils.UtilitiesTestBase; import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; import org.apache.spark.streaming.kafka010.KafkaTestUtils; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; @@ -38,7 +37,7 @@ import java.io.IOException; import java.util.Random; -public class TestHoodieDeltaStreamerBase extends UtilitiesTestBase { +public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase { static final Random RANDOM = new Random(); @@ -50,12 +49,16 @@ public class TestHoodieDeltaStreamerBase extends UtilitiesTestBase { static final String PROPS_FILENAME_TEST_INVALID = "test-invalid.properties"; static final String PROPS_FILENAME_TEST_CSV =
"test-csv-dfs-source.properties"; static final String PROPS_FILENAME_TEST_PARQUET = "test-parquet-dfs-source.properties"; + static final String PROPS_FILENAME_TEST_ORC = "test-orc-dfs-source.properties"; static final String PROPS_FILENAME_TEST_JSON_KAFKA = "test-json-kafka-dfs-source.properties"; static final String PROPS_FILENAME_TEST_MULTI_WRITER = "test-multi-writer.properties"; static final String FIRST_PARQUET_FILE_NAME = "1.parquet"; + static final String FIRST_ORC_FILE_NAME = "1.orc"; static String PARQUET_SOURCE_ROOT; + static String ORC_SOURCE_ROOT; static String JSON_KAFKA_SOURCE_ROOT; static final int PARQUET_NUM_RECORDS = 5; + static final int ORC_NUM_RECORDS = 5; static final int CSV_NUM_RECORDS = 3; static final int JSON_KAFKA_NUM_RECORDS = 5; String kafkaCheckpointType = "string"; @@ -74,7 +77,6 @@ public class TestHoodieDeltaStreamerBase extends UtilitiesTestBase { static final String HOODIE_CONF_PARAM = "--hoodie-conf"; static final String HOODIE_CONF_VALUE1 = "hoodie.datasource.hive_sync.table=test_table"; static final String HOODIE_CONF_VALUE2 = "hoodie.datasource.write.recordkey.field=Field1,Field2,Field3"; - static final Logger LOG = LogManager.getLogger(TestHoodieDeltaStreamerBase.class); public static KafkaTestUtils testUtils; protected static String topicName; protected static String defaultSchemaProviderClassName = FilebasedSchemaProvider.class.getName(); @@ -84,11 +86,18 @@ public class TestHoodieDeltaStreamerBase extends UtilitiesTestBase { public static void initClass() throws Exception { UtilitiesTestBase.initClass(true); PARQUET_SOURCE_ROOT = dfsBasePath + "/parquetFiles"; + ORC_SOURCE_ROOT = dfsBasePath + "/orcFiles"; JSON_KAFKA_SOURCE_ROOT = dfsBasePath + "/jsonKafkaFiles"; testUtils = new KafkaTestUtils(); testUtils.setup(); topicName = "topic" + testNum; + prepareInitialConfigs(dfs, dfsBasePath, testUtils.brokerAddress()); + prepareParquetDFSFiles(PARQUET_NUM_RECORDS, PARQUET_SOURCE_ROOT); + prepareORCDFSFiles(ORC_NUM_RECORDS, ORC_SOURCE_ROOT); + } + + protected static void prepareInitialConfigs(FileSystem dfs, String dfsBasePath, String brokerAddress) throws IOException { // prepare the configs. 
UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/base.properties", dfs, dfsBasePath + "/base.properties"); UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/base.properties", dfs, dfsBasePath + "/config/base.properties"); @@ -109,7 +118,7 @@ public static void initClass() throws Exception { UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/short_trip_uber_config.properties", dfs, dfsBasePath + "/config/short_trip_uber_config.properties"); UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/clusteringjob.properties", dfs, dfsBasePath + "/clusteringjob.properties"); - writeCommonPropsToFile(); + writeCommonPropsToFile(dfs, dfsBasePath); // Properties used for the delta-streamer which incrementally pulls from upstream Hudi source table and writes to // downstream hudi table @@ -134,22 +143,20 @@ public static void initClass() throws Exception { UtilitiesTestBase.Helpers.savePropsToDFS(invalidProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_INVALID); TypedProperties props1 = new TypedProperties(); - populateAllCommonProps(props1); + populateAllCommonProps(props1, dfsBasePath, brokerAddress); UtilitiesTestBase.Helpers.savePropsToDFS(props1, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE1); TypedProperties properties = new TypedProperties(); - populateInvalidTableConfigFilePathProps(properties); + populateInvalidTableConfigFilePathProps(properties, dfsBasePath); UtilitiesTestBase.Helpers.savePropsToDFS(properties, dfs, dfsBasePath + "/" + PROPS_INVALID_TABLE_CONFIG_FILE); TypedProperties invalidHiveSyncProps = new TypedProperties(); invalidHiveSyncProps.setProperty("hoodie.deltastreamer.ingestion.tablesToBeIngested", "uber_db.dummy_table_uber"); invalidHiveSyncProps.setProperty("hoodie.deltastreamer.ingestion.uber_db.dummy_table_uber.configFile", dfsBasePath + "/config/invalid_hive_sync_uber_config.properties"); UtilitiesTestBase.Helpers.savePropsToDFS(invalidHiveSyncProps, dfs, dfsBasePath + "/" + PROPS_INVALID_HIVE_SYNC_TEST_SOURCE1); - - prepareParquetDFSFiles(PARQUET_NUM_RECORDS, PARQUET_SOURCE_ROOT); } - protected static void writeCommonPropsToFile() throws IOException { + protected static void writeCommonPropsToFile(FileSystem dfs, String dfsBasePath) throws IOException { TypedProperties props = new TypedProperties(); props.setProperty("include", "sql-transformer.properties"); props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); @@ -186,20 +193,20 @@ public void teardown() throws Exception { super.teardown(); } - private static void populateInvalidTableConfigFilePathProps(TypedProperties props) { + protected static void populateInvalidTableConfigFilePathProps(TypedProperties props, String dfsBasePath) { props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyyMMdd"); props.setProperty("hoodie.deltastreamer.ingestion.tablesToBeIngested", "uber_db.dummy_table_uber"); props.setProperty("hoodie.deltastreamer.ingestion.uber_db.dummy_table_uber.configFile", dfsBasePath + "/config/invalid_uber_config.properties"); } - static void populateAllCommonProps(TypedProperties props) { - populateCommonProps(props); - populateCommonKafkaProps(props); + protected static void populateAllCommonProps(TypedProperties props, String dfsBasePath, String brokerAddress) { + populateCommonProps(props, dfsBasePath); + populateCommonKafkaProps(props, brokerAddress); 
populateCommonHiveProps(props); } - protected static void populateCommonProps(TypedProperties props) { + protected static void populateCommonProps(TypedProperties props, String dfsBasePath) { props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyyMMdd"); props.setProperty("hoodie.deltastreamer.ingestion.tablesToBeIngested", "short_trip_db.dummy_table_short_trip,uber_db.dummy_table_uber"); @@ -207,9 +214,9 @@ protected static void populateCommonProps(TypedProperties props) { props.setProperty("hoodie.deltastreamer.ingestion.short_trip_db.dummy_table_short_trip.configFile", dfsBasePath + "/config/short_trip_uber_config.properties"); } - protected static void populateCommonKafkaProps(TypedProperties props) { + protected static void populateCommonKafkaProps(TypedProperties props, String brokerAddress) { //Kafka source properties - props.setProperty("bootstrap.servers", testUtils.brokerAddress()); + props.setProperty("bootstrap.servers", brokerAddress); props.setProperty("auto.offset.reset", "earliest"); props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); @@ -247,4 +254,27 @@ protected static void prepareParquetDFSFiles(int numRecords, String baseParquetP dataGenerator.generateInserts("000", numRecords)), new Path(path)); } } + + protected static void prepareORCDFSFiles(int numRecords) throws IOException { + prepareORCDFSFiles(numRecords, ORC_SOURCE_ROOT); + } + + protected static void prepareORCDFSFiles(int numRecords, String baseORCPath) throws IOException { + prepareORCDFSFiles(numRecords, baseORCPath, FIRST_ORC_FILE_NAME, false, null, null); + } + + protected static void prepareORCDFSFiles(int numRecords, String baseORCPath, String fileName, boolean useCustomSchema, + String schemaStr, Schema schema) throws IOException { + String path = baseORCPath + "/" + fileName; + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); + if (useCustomSchema) { + Helpers.saveORCToDFS(Helpers.toGenericRecords( + dataGenerator.generateInsertsAsPerSchema("000", numRecords, schemaStr), + schema), new Path(path), HoodieTestDataGenerator.ORC_TRIP_SCHEMA); + } else { + Helpers.saveORCToDFS(Helpers.toGenericRecords( + dataGenerator.generateInserts("000", numRecords)), new Path(path)); + } + } + } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHDFSParquetImporter.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHDFSParquetImporter.java index 6d0141e407b88..28ba17efa9f46 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHDFSParquetImporter.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHDFSParquetImporter.java @@ -230,7 +230,7 @@ public void testImportWithUpsert() throws IOException, ParseException { public List createInsertRecords(Path srcFolder) throws ParseException, IOException { Path srcFile = new Path(srcFolder.toString(), "file1.parquet"); - long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000; + long startTime = HoodieActiveTimeline.parseInstantTime("20170203000000").getTime() / 1000; List records = new ArrayList(); for (long recordNum = 0; recordNum < 96; recordNum++) { 
records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "0", "rider-" + recordNum, @@ -247,7 +247,7 @@ public List createInsertRecords(Path srcFolder) throws ParseExcep public List createUpsertRecords(Path srcFolder) throws ParseException, IOException { Path srcFile = new Path(srcFolder.toString(), "file1.parquet"); - long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000; + long startTime = HoodieActiveTimeline.parseInstantTime("20170203000000").getTime() / 1000; List records = new ArrayList(); // 10 for update for (long recordNum = 0; recordNum < 11; recordNum++) { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java index aab02da238dbf..014a0c140d62e 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java @@ -22,10 +22,12 @@ import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.common.config.DFSPropertiesConfiguration; import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.WriteOperationType; @@ -54,6 +56,7 @@ import org.apache.hudi.utilities.sources.InputBatch; import org.apache.hudi.utilities.sources.JdbcSource; import org.apache.hudi.utilities.sources.JsonKafkaSource; +import org.apache.hudi.utilities.sources.ORCDFSSource; import org.apache.hudi.utilities.sources.ParquetDFSSource; import org.apache.hudi.utilities.sources.TestDataSource; import org.apache.hudi.utilities.testutils.JdbcTestUtils; @@ -67,7 +70,9 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.kafka.common.errors.TopicExistsException; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -84,7 +89,6 @@ import org.apache.spark.sql.types.StructField; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; @@ -98,10 +102,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.ConcurrentModificationException; import java.util.List; +import java.util.Map; import java.util.Properties; -import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; @@ -118,43 +121,16 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; +import static 
org.junit.jupiter.params.provider.Arguments.arguments; /** * Basic tests against {@link HoodieDeltaStreamer}, by issuing bulk_inserts, upserts, inserts. Check counts at the end. */ -public class TestHoodieDeltaStreamer extends TestHoodieDeltaStreamerBase { +public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase { private static final Logger LOG = LogManager.getLogger(TestHoodieDeltaStreamer.class); - protected static TypedProperties prepareMultiWriterProps(String propsFileName) throws IOException { - TypedProperties props = new TypedProperties(); - populateAllCommonProps(props); - - props.setProperty("include", "sql-transformer.properties"); - props.setProperty("hoodie.datasource.write.keygenerator.class", TestGenerator.class.getName()); - props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); - props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); - props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); - props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); - - props.setProperty("include", "base.properties"); - props.setProperty("hoodie.write.concurrency.mode", "optimistic_concurrency_control"); - props.setProperty("hoodie.cleaner.policy.failed.writes", "LAZY"); - props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider"); - props.setProperty("hoodie.write.lock.hivemetastore.database", "testdb1"); - props.setProperty("hoodie.write.lock.hivemetastore.table", "table1"); - props.setProperty("hoodie.write.lock.zookeeper.url", "127.0.0.1"); - props.setProperty("hoodie.write.lock.zookeeper.port", "2828"); - props.setProperty("hoodie.write.lock.wait_time_ms", "1200000"); - props.setProperty("hoodie.write.lock.num_retries", "10"); - props.setProperty("hoodie.write.lock.zookeeper.lock_key", "test_table"); - props.setProperty("hoodie.write.lock.zookeeper.base_path", "/test"); - - UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + propsFileName); - return props; - } - protected HoodieDeltaStreamer initialHoodieDeltaStreamer(String tableBasePath, int totalRecords, String asyncCluster) throws IOException { HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT); cfg.continuousMode = true; @@ -263,26 +239,32 @@ static HoodieDeltaStreamer.Config makeConfigForHudiIncrSrc(String srcBasePath, S } static void assertRecordCount(long expected, String tablePath, SQLContext sqlContext) { + sqlContext.clearCache(); long recordCount = sqlContext.read().format("org.apache.hudi").load(tablePath).count(); assertEquals(expected, recordCount); } static List countsPerCommit(String tablePath, SQLContext sqlContext) { - return sqlContext.read().format("org.apache.hudi").load(tablePath).groupBy("_hoodie_commit_time").count() + sqlContext.clearCache(); + List rows = sqlContext.read().format("org.apache.hudi").load(tablePath) + .groupBy("_hoodie_commit_time").count() .sort("_hoodie_commit_time").collectAsList(); + return rows; } static void assertDistanceCount(long expected, String tablePath, SQLContext sqlContext) { + sqlContext.clearCache(); sqlContext.read().format("org.apache.hudi").load(tablePath).registerTempTable("tmp_trips"); long recordCount = - sqlContext.sparkSession().sql("select * from tmp_trips where haversine_distance is not NULL").count(); + sqlContext.sql("select * from tmp_trips where haversine_distance is not 
NULL").count(); assertEquals(expected, recordCount); } static void assertDistanceCountWithExactValue(long expected, String tablePath, SQLContext sqlContext) { + sqlContext.clearCache(); sqlContext.read().format("org.apache.hudi").load(tablePath).registerTempTable("tmp_trips"); long recordCount = - sqlContext.sparkSession().sql("select * from tmp_trips where haversine_distance = 1.0").count(); + sqlContext.sql("select * from tmp_trips where haversine_distance = 1.0").count(); assertEquals(expected, recordCount); } @@ -335,7 +317,7 @@ static void waitTillCondition(Function condition, Future dsFut boolean ret = false; while (!ret && !dsFuture.isDone()) { try { - Thread.sleep(3000); + Thread.sleep(5000); ret = condition.apply(true); } catch (Throwable error) { LOG.warn("Got error :", error); @@ -673,7 +655,7 @@ public void testSchemaEvolution(String tableType, boolean useUserProvidedSchema, // clean up and reinit UtilitiesTestBase.Helpers.deleteFileFromDfs(FSUtils.getFs(cfg.targetBasePath, jsc.hadoopConfiguration()), dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE); - writeCommonPropsToFile(); + writeCommonPropsToFile(dfs, dfsBasePath); defaultSchemaProviderClassName = FilebasedSchemaProvider.class.getName(); } @@ -687,21 +669,6 @@ public void testUpsertsMORContinuousMode() throws Exception { testUpsertsContinuousMode(HoodieTableType.MERGE_ON_READ, "continuous_mor"); } - @Test - public void testUpsertsCOWContinuousModeWithMultipleWriters() throws Exception { - testUpsertsContinuousModeWithMultipleWriters(HoodieTableType.COPY_ON_WRITE, "continuous_cow_mulitwriter"); - } - - @Test - public void testUpsertsMORContinuousModeWithMultipleWriters() throws Exception { - testUpsertsContinuousModeWithMultipleWriters(HoodieTableType.MERGE_ON_READ, "continuous_mor_mulitwriter"); - } - - @Test - public void testLatestCheckpointCarryOverWithMultipleWriters() throws Exception { - testLatestCheckpointCarryOverWithMultipleWriters(HoodieTableType.COPY_ON_WRITE, "continuous_cow_checkpoint"); - } - private void testUpsertsContinuousMode(HoodieTableType tableType, String tempDir) throws Exception { String tableBasePath = dfsBasePath + "/" + tempDir; // Keep it higher than batch-size to test continuous mode @@ -726,216 +693,7 @@ private void testUpsertsContinuousMode(HoodieTableType tableType, String tempDir }); } - private void testUpsertsContinuousModeWithMultipleWriters(HoodieTableType tableType, String tempDir) throws Exception { - // NOTE : Overriding the LockProvider to FileSystemBasedLockProviderTestClass since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts - String tableBasePath = dfsBasePath + "/" + tempDir; - // enable carrying forward latest checkpoint - TypedProperties props = prepareMultiWriterProps(PROPS_FILENAME_TEST_MULTI_WRITER); - props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); - props.setProperty("hoodie.write.lock.filesystem.path", tableBasePath); - UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER); - // Keep it higher than batch-size to test continuous mode - int totalRecords = 3000; - - HoodieDeltaStreamer.Config cfgIngestionJob = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, - Arrays.asList(TripsWithDistanceTransformer.class.getName()), PROPS_FILENAME_TEST_MULTI_WRITER, false); - cfgIngestionJob.continuousMode = true; - cfgIngestionJob.tableType = tableType.name(); - 
cfgIngestionJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); - cfgIngestionJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); - HoodieDeltaStreamer ingestionJob = new HoodieDeltaStreamer(cfgIngestionJob, jsc); - - // Prepare base dataset with some commits - deltaStreamerTestRunner(ingestionJob, cfgIngestionJob, (r) -> { - if (tableType.equals(HoodieTableType.MERGE_ON_READ)) { - TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, dfs); - TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, dfs); - } else { - TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, dfs); - } - TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext); - TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext); - return true; - }); - - // create a backfill job - HoodieDeltaStreamer.Config cfgBackfillJob = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, - Arrays.asList(TripsWithDistanceTransformer.class.getName()), PROPS_FILENAME_TEST_MULTI_WRITER, false); - cfgBackfillJob.continuousMode = false; - cfgBackfillJob.tableType = tableType.name(); - HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build(); - HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class); - cfgBackfillJob.checkpoint = commitMetadata.getMetadata(CHECKPOINT_KEY); - cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); - cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); - HoodieDeltaStreamer backfillJob = new HoodieDeltaStreamer(cfgBackfillJob, jsc); - - // re-init ingestion job to start sync service - HoodieDeltaStreamer ingestionJob2 = new HoodieDeltaStreamer(cfgIngestionJob, jsc); - - // run ingestion & backfill in parallel, create conflict and fail one - runJobsInParallel(tableBasePath, tableType, totalRecords, ingestionJob2, - cfgIngestionJob, backfillJob, cfgBackfillJob, true); - - // create new ingestion & backfill job config to generate only INSERTS to avoid conflict - props = prepareMultiWriterProps(PROPS_FILENAME_TEST_MULTI_WRITER); - props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); - props.setProperty("hoodie.write.lock.filesystem.path", tableBasePath); - props.setProperty("hoodie.test.source.generate.inserts", "true"); - UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER); - cfgBackfillJob = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, - Arrays.asList(TestIdentityTransformer.class.getName()), PROPS_FILENAME_TEST_MULTI_WRITER, false); - cfgBackfillJob.continuousMode = false; - cfgBackfillJob.tableType = tableType.name(); - meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build(); - timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - commitMetadata = HoodieCommitMetadata - .fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class); - cfgBackfillJob.checkpoint = commitMetadata.getMetadata(CHECKPOINT_KEY); - 
cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); - cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); - - cfgIngestionJob = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, - Arrays.asList(TestIdentityTransformer.class.getName()), PROPS_FILENAME_TEST_MULTI_WRITER, false); - cfgIngestionJob.continuousMode = true; - cfgIngestionJob.tableType = tableType.name(); - cfgIngestionJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); - cfgIngestionJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); - // re-init ingestion job - HoodieDeltaStreamer ingestionJob3 = new HoodieDeltaStreamer(cfgIngestionJob, jsc); - // re-init backfill job - HoodieDeltaStreamer backfillJob2 = new HoodieDeltaStreamer(cfgBackfillJob, jsc); - - // run ingestion & backfill in parallel, avoid conflict and succeed both - runJobsInParallel(tableBasePath, tableType, totalRecords, ingestionJob3, - cfgIngestionJob, backfillJob2, cfgBackfillJob, false); - } - - private void testLatestCheckpointCarryOverWithMultipleWriters(HoodieTableType tableType, String tempDir) throws Exception { - // NOTE : Overriding the LockProvider to FileSystemBasedLockProviderTestClass since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts - String tableBasePath = dfsBasePath + "/" + tempDir; - // enable carrying forward latest checkpoint - TypedProperties props = prepareMultiWriterProps(PROPS_FILENAME_TEST_MULTI_WRITER); - props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); - props.setProperty("hoodie.write.lock.filesystem.path", tableBasePath); - UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER); - // Keep it higher than batch-size to test continuous mode - int totalRecords = 3000; - - HoodieDeltaStreamer.Config cfgIngestionJob = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, - Arrays.asList(TripsWithDistanceTransformer.class.getName()), PROPS_FILENAME_TEST_MULTI_WRITER, false); - cfgIngestionJob.continuousMode = true; - cfgIngestionJob.tableType = tableType.name(); - cfgIngestionJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); - cfgIngestionJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); - HoodieDeltaStreamer ingestionJob = new HoodieDeltaStreamer(cfgIngestionJob, jsc); - - // Prepare base dataset with some commits - deltaStreamerTestRunner(ingestionJob, cfgIngestionJob, (r) -> { - if (tableType.equals(HoodieTableType.MERGE_ON_READ)) { - TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, dfs); - TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, dfs); - } else { - TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, dfs); - } - TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext); - TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext); - return true; - }); - - // create a backfill job with checkpoint from the first instant - HoodieDeltaStreamer.Config cfgBackfillJob = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, - Arrays.asList(TripsWithDistanceTransformer.class.getName()), PROPS_FILENAME_TEST_MULTI_WRITER, false); - cfgBackfillJob.continuousMode = false; - cfgBackfillJob.tableType = 
tableType.name(); - HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build(); - HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - HoodieCommitMetadata commitMetadataForFirstInstant = HoodieCommitMetadata - .fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class); - - // get current checkpoint after preparing base dataset with some commits - HoodieCommitMetadata commitMetadataForLastInstant = HoodieCommitMetadata - .fromBytes(timeline.getInstantDetails(timeline.lastInstant().get()).get(), HoodieCommitMetadata.class); - String lastCheckpointBeforeParallelBackfill = commitMetadataForLastInstant.getMetadata(CHECKPOINT_KEY); - - // run the backfill job, enable overriding checkpoint from the latest commit - props = prepareMultiWriterProps(PROPS_FILENAME_TEST_MULTI_WRITER); - props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); - props.setProperty("hoodie.write.lock.filesystem.path", tableBasePath); - props.setProperty("hoodie.write.meta.key.prefixes", CHECKPOINT_KEY); - UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER); - - // reset checkpoint to first instant to simulate a random checkpoint for backfill job - // checkpoint will move from 00000 to 00001 for this backfill job - cfgBackfillJob.checkpoint = commitMetadataForFirstInstant.getMetadata(CHECKPOINT_KEY); - cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); - cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); - HoodieDeltaStreamer backfillJob = new HoodieDeltaStreamer(cfgBackfillJob, jsc); - backfillJob.sync(); - - // check if the checkpoint is carried over - timeline = meta.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); - commitMetadataForLastInstant = HoodieCommitMetadata - .fromBytes(timeline.getInstantDetails(timeline.lastInstant().get()).get(), HoodieCommitMetadata.class); - String lastCheckpointAfterParallelBackfill = commitMetadataForLastInstant.getMetadata(CHECKPOINT_KEY); - Assertions.assertEquals(lastCheckpointBeforeParallelBackfill, lastCheckpointAfterParallelBackfill); - } - - private void runJobsInParallel(String tableBasePath, HoodieTableType tableType, int totalRecords, - HoodieDeltaStreamer ingestionJob, HoodieDeltaStreamer.Config cfgIngestionJob, HoodieDeltaStreamer backfillJob, - HoodieDeltaStreamer.Config cfgBackfillJob, boolean expectConflict) throws Exception { - ExecutorService service = Executors.newFixedThreadPool(2); - HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build(); - HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - String lastSuccessfulCommit = timeline.lastInstant().get().getTimestamp(); - // Condition for parallel ingestion job - Function conditionForRegularIngestion = (r) -> { - if (tableType.equals(HoodieTableType.MERGE_ON_READ)) { - TestHelpers.assertAtleastNDeltaCommitsAfterCommit(3, lastSuccessfulCommit, tableBasePath, dfs); - } else { - TestHelpers.assertAtleastNCompactionCommitsAfterCommit(3, lastSuccessfulCommit, tableBasePath, dfs); - } - TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext); - 
TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext); - return true; - }; - - try { - Future regularIngestionJobFuture = service.submit(() -> { - try { - deltaStreamerTestRunner(ingestionJob, cfgIngestionJob, conditionForRegularIngestion); - } catch (Exception ex) { - throw new RuntimeException(ex); - } - }); - Future backfillJobFuture = service.submit(() -> { - try { - backfillJob.sync(); - } catch (Exception ex) { - throw new RuntimeException(ex); - } - }); - backfillJobFuture.get(); - regularIngestionJobFuture.get(); - if (expectConflict) { - Assertions.fail("Failed to handle concurrent writes"); - } - } catch (Exception e) { - /** - * Need to perform getMessage().contains since the exception coming - * from {@link org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.DeltaSyncService} gets wrapped many times into RuntimeExceptions. - */ - if (expectConflict && e.getCause().getMessage().contains(ConcurrentModificationException.class.getName())) { - // expected ConcurrentModificationException since ingestion & backfill will have overlapping writes - } else { - throw e; - } - } - } - - private void deltaStreamerTestRunner(HoodieDeltaStreamer ds, HoodieDeltaStreamer.Config cfg, Function condition) throws Exception { + static void deltaStreamerTestRunner(HoodieDeltaStreamer ds, HoodieDeltaStreamer.Config cfg, Function condition) throws Exception { Future dsFuture = Executors.newSingleThreadExecutor().submit(() -> { try { ds.sync(); @@ -949,12 +707,13 @@ private void deltaStreamerTestRunner(HoodieDeltaStreamer ds, HoodieDeltaStreamer dsFuture.get(); } - private void deltaStreamerTestRunner(HoodieDeltaStreamer ds, Function condition) throws Exception { + static void deltaStreamerTestRunner(HoodieDeltaStreamer ds, Function condition) throws Exception { deltaStreamerTestRunner(ds, null, condition); } - @Test - public void testInlineClustering() throws Exception { + @ParameterizedTest + @ValueSource(strings = {"true", "false"}) + public void testInlineClustering(String preserveCommitMetadata) throws Exception { String tableBasePath = dfsBasePath + "/inlineClustering"; // Keep it higher than batch-size to test continuous mode int totalRecords = 3000; @@ -963,7 +722,7 @@ public void testInlineClustering() throws Exception { HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT); cfg.continuousMode = true; cfg.tableType = HoodieTableType.MERGE_ON_READ.name(); - cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", "")); + cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", "", preserveCommitMetadata)); HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc); deltaStreamerTestRunner(ds, cfg, (r) -> { TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs); @@ -972,6 +731,95 @@ public void testInlineClustering() throws Exception { }); } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean) throws Exception { + String tableBasePath = dfsBasePath + "/cleanerDeleteReplacedDataWithArchive" + asyncClean; + + int totalRecords = 3000; + + // Step 1 : Prepare and insert data without archival and cleaner. + // Make sure that there are 6 commits including 2 replacecommits completed. 
+ HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT); + cfg.continuousMode = true; + cfg.tableType = HoodieTableType.COPY_ON_WRITE.name(); + cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", "")); + cfg.configs.add(String.format("%s=%s", HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), "0")); + cfg.configs.add(HoodieMetadataConfig.ENABLE.key() + "=false"); + HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc); + deltaStreamerTestRunner(ds, cfg, (r) -> { + TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs); + return true; + }); + + TestHelpers.assertAtLeastNCommits(6, tableBasePath, dfs); + TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs); + + // Step 2 : Get the first replacecommit and extract the corresponding replaced file IDs. + HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build(); + HoodieTimeline replacedTimeline = meta.reloadActiveTimeline().getCompletedReplaceTimeline(); + Option firstReplaceHoodieInstant = replacedTimeline.nthFromLastInstant(1); + assertTrue(firstReplaceHoodieInstant.isPresent()); + + Option firstReplaceHoodieInstantDetails = replacedTimeline.getInstantDetails(firstReplaceHoodieInstant.get()); + HoodieReplaceCommitMetadata firstReplaceMetadata = HoodieReplaceCommitMetadata.fromBytes(firstReplaceHoodieInstantDetails.get(), HoodieReplaceCommitMetadata.class); + Map> partitionToReplaceFileIds = firstReplaceMetadata.getPartitionToReplaceFileIds(); + String partitionName = null; + List replacedFileIDs = null; + for (Map.Entry entry : partitionToReplaceFileIds.entrySet()) { + partitionName = String.valueOf(entry.getKey()); + replacedFileIDs = (List) entry.getValue(); + } + + assertNotNull(partitionName); + assertNotNull(replacedFileIDs); + + // Step 3 : Based to replacedFileIDs , get the corresponding complete path. + ArrayList replacedFilePaths = new ArrayList<>(); + Path partitionPath = new Path(meta.getBasePath(), partitionName); + RemoteIterator hoodieFiles = meta.getFs().listFiles(partitionPath, true); + while (hoodieFiles.hasNext()) { + LocatedFileStatus f = hoodieFiles.next(); + String file = f.getPath().toUri().toString(); + for (Object replacedFileID : replacedFileIDs) { + if (file.contains(String.valueOf(replacedFileID))) { + replacedFilePaths.add(file); + } + } + } + + assertFalse(replacedFilePaths.isEmpty()); + + // Step 4 : Insert 1 record and trigger sync/async cleaner and archive. + List configs = getAsyncServicesConfigs(1, "true", "true", "2", "", ""); + configs.add(String.format("%s=%s", HoodieCompactionConfig.CLEANER_POLICY.key(), "KEEP_LATEST_COMMITS")); + configs.add(String.format("%s=%s", HoodieCompactionConfig.CLEANER_COMMITS_RETAINED.key(), "1")); + configs.add(String.format("%s=%s", HoodieCompactionConfig.MIN_COMMITS_TO_KEEP.key(), "2")); + configs.add(String.format("%s=%s", HoodieCompactionConfig.MAX_COMMITS_TO_KEEP.key(), "3")); + configs.add(String.format("%s=%s", HoodieCompactionConfig.ASYNC_CLEAN, asyncClean)); + configs.add(HoodieMetadataConfig.ENABLE.key() + "=false"); + cfg.configs = configs; + cfg.continuousMode = false; + ds = new HoodieDeltaStreamer(cfg, jsc); + ds.sync(); + + // Step 5 : Make sure that firstReplaceHoodieInstant is archived. 
+ long count = meta.reloadActiveTimeline().getCompletedReplaceTimeline().getInstants().filter(instant -> firstReplaceHoodieInstant.get().equals(instant)).count(); + assertEquals(0, count); + + // Step 6 : All the replaced files in firstReplaceHoodieInstant should be deleted through sync/async cleaner. + for (String replacedFilePath : replacedFilePaths) { + assertFalse(meta.getFs().exists(new Path(replacedFilePath))); + } + } + + private List getAsyncServicesConfigs(int totalRecords, String autoClean, String inlineCluster, String inlineClusterMaxCommit, + String asyncCluster, String asyncClusterMaxCommit, String preserveCommitMetadata) { + List configs = getAsyncServicesConfigs(totalRecords, autoClean, inlineCluster, inlineClusterMaxCommit, asyncCluster, asyncClusterMaxCommit); + configs.add(String.format("%s=%s", HoodieClusteringConfig.PRESERVE_COMMIT_METADATA.key(), preserveCommitMetadata)); + return configs; + } + private List getAsyncServicesConfigs(int totalRecords, String autoClean, String inlineCluster, String inlineClusterMaxCommit, String asyncCluster, String asyncClusterMaxCommit) { List configs = new ArrayList<>(); @@ -1064,8 +912,9 @@ public void testAsyncClusteringService() throws Exception { }); } - @Test - public void testAsyncClusteringServiceWithCompaction() throws Exception { + @ParameterizedTest + @ValueSource(strings = {"true", "false"}) + public void testAsyncClusteringServiceWithCompaction(String preserveCommitMetadata) throws Exception { String tableBasePath = dfsBasePath + "/asyncClusteringCompaction"; // Keep it higher than batch-size to test continuous mode int totalRecords = 3000; @@ -1074,7 +923,7 @@ public void testAsyncClusteringServiceWithCompaction() throws Exception { HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT); cfg.continuousMode = true; cfg.tableType = HoodieTableType.MERGE_ON_READ.name(); - cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "", "", "true", "2")); + cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "", "", "true", "2", preserveCommitMetadata)); HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc); deltaStreamerTestRunner(ds, cfg, (r) -> { TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs); @@ -1211,7 +1060,7 @@ public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() t "Table partitions should match the number of partitions we wrote"); assertEquals(lastInstantForUpstreamTable, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), - "The last commit that was sycned should be updated in the TBLPROPERTIES"); + "The last commit that was synced should be updated in the TBLPROPERTIES"); } @Test @@ -1369,7 +1218,7 @@ private void prepareParquetDFSSource(boolean useSchemaProvider, boolean hasTrans TypedProperties parquetProps = new TypedProperties(); if (addCommonProps) { - populateCommonProps(parquetProps); + populateCommonProps(parquetProps, dfsBasePath); } parquetProps.setProperty("include", "base.properties"); @@ -1398,10 +1247,38 @@ private void testParquetDFSSource(boolean useSchemaProvider, List transf testNum++; } + private void testORCDFSSource(boolean useSchemaProvider, List transformerClassNames) throws Exception { + // prepare ORCDFSSource + TypedProperties orcProps = new TypedProperties(); + + // Properties used for testing delta-streamer with orc source + orcProps.setProperty("include", "base.properties"); + orcProps.setProperty("hoodie.embed.timeline.server","false"); + 
orcProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + orcProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); + if (useSchemaProvider) { + orcProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/" + "source.avsc"); + if (transformerClassNames != null) { + orcProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/" + "target.avsc"); + } + } + orcProps.setProperty("hoodie.deltastreamer.source.dfs.root", ORC_SOURCE_ROOT); + UtilitiesTestBase.Helpers.savePropsToDFS(orcProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_ORC); + + String tableBasePath = dfsBasePath + "/test_orc_source_table" + testNum; + HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer( + TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, ORCDFSSource.class.getName(), + transformerClassNames, PROPS_FILENAME_TEST_ORC, false, + useSchemaProvider, 100000, false, null, null, "timestamp", null), jsc); + deltaStreamer.sync(); + TestHelpers.assertRecordCount(ORC_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext); + testNum++; + } + private void prepareJsonKafkaDFSSource(String propsFileName, String autoResetValue, String topicName) throws IOException { // Properties used for testing delta-streamer with JsonKafka source TypedProperties props = new TypedProperties(); - populateAllCommonProps(props); + populateAllCommonProps(props, dfsBasePath, testUtils.brokerAddress()); props.setProperty("include", "base.properties"); props.setProperty("hoodie.embed.timeline.server", "false"); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); @@ -1430,8 +1307,8 @@ private void testDeltaStreamerTransitionFromParquetToKafkaSource(boolean autoRes prepareParquetDFSSource(true, false, "source_uber.avsc", "target_uber.avsc", PROPS_FILENAME_TEST_PARQUET, PARQUET_SOURCE_ROOT, false); - // delta streamer w/ parquest source - String tableBasePath = dfsBasePath + "/test_dfs_to_kakfa" + testNum; + // delta streamer w/ parquet source + String tableBasePath = dfsBasePath + "/test_dfs_to_kafka" + testNum; HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer( TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, ParquetDFSSource.class.getName(), Collections.EMPTY_LIST, PROPS_FILENAME_TEST_PARQUET, false, @@ -1538,6 +1415,12 @@ public void testParquetDFSSourceWithSchemaFilesAndTransformer() throws Exception testParquetDFSSource(true, Collections.singletonList(TripsWithDistanceTransformer.class.getName())); } + @ParameterizedTest + @MethodSource("testORCDFSSource") + public void testORCDFSSourceWithoutSchemaProviderAndNoTransformer(boolean useSchemaProvider, List transformerClassNames) throws Exception { + testORCDFSSource(useSchemaProvider, transformerClassNames); + } + private void prepareCsvDFSSource( boolean hasHeader, char sep, boolean useSchemaProvider, boolean hasTransformer) throws IOException { String sourceRoot = dfsBasePath + "/csvFiles"; @@ -1852,4 +1735,12 @@ public Schema getTargetSchema() { } } + private static Stream testORCDFSSource() { + // arg1 boolean useSchemaProvider, arg2 List transformerClassNames + return Stream.of( + arguments(false, null), + arguments(true, Collections.singletonList(TripsWithDistanceTransformer.class.getName())) + ); + } + } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerWithMultiWriter.java 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerWithMultiWriter.java new file mode 100644 index 0000000000000..3cdf5f9027a10 --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerWithMultiWriter.java @@ -0,0 +1,329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.utilities.functional; + +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; +import org.apache.hudi.utilities.sources.TestDataSource; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.apache.hudi.utilities.testutils.sources.config.SourceConfigs; + +import org.apache.hadoop.fs.FileSystem; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.ConcurrentModificationException; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.function.Function; + +import static org.apache.hudi.common.testutils.FixtureUtils.prepareFixtureTable; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.config.HoodieWriteConfig.BULKINSERT_PARALLELISM_VALUE; +import static org.apache.hudi.config.HoodieWriteConfig.BULK_INSERT_SORT_MODE; +import static org.apache.hudi.config.HoodieWriteConfig.FINALIZE_WRITE_PARALLELISM_VALUE; +import static org.apache.hudi.config.HoodieWriteConfig.INSERT_PARALLELISM_VALUE; +import static org.apache.hudi.config.HoodieWriteConfig.UPSERT_PARALLELISM_VALUE; +import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY; +import static org.apache.hudi.utilities.functional.HoodieDeltaStreamerTestBase.PROPS_FILENAME_TEST_MULTI_WRITER; +import static 
org.apache.hudi.utilities.functional.HoodieDeltaStreamerTestBase.defaultSchemaProviderClassName; +import static org.apache.hudi.utilities.functional.HoodieDeltaStreamerTestBase.prepareInitialConfigs; +import static org.apache.hudi.utilities.functional.TestHoodieDeltaStreamer.deltaStreamerTestRunner; +import static org.apache.hudi.utilities.testutils.sources.AbstractBaseTestSource.DEFAULT_PARTITION_NUM; +import static org.apache.hudi.utilities.testutils.sources.AbstractBaseTestSource.dataGeneratorMap; +import static org.apache.hudi.utilities.testutils.sources.AbstractBaseTestSource.initDataGen; + +@Tag("functional") +public class TestHoodieDeltaStreamerWithMultiWriter extends SparkClientFunctionalTestHarness { + + String basePath; + String propsFilePath; + String tableBasePath; + int totalRecords; + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + void testUpsertsContinuousModeWithMultipleWriters(HoodieTableType tableType) throws Exception { + // NOTE : Overriding the LockProvider to FileSystemBasedLockProviderTestClass since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts + setUpTestTable(tableType); + prepareInitialConfigs(fs(), basePath, "foo"); + // enable carrying forward latest checkpoint + TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath); + props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); + props.setProperty("hoodie.write.lock.filesystem.path", tableBasePath); + props.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "3"); + props.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "5000"); + UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath); + + HoodieDeltaStreamer.Config cfgIngestionJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, + propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName())); + cfgIngestionJob.continuousMode = true; + cfgIngestionJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + cfgIngestionJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + + // create a backfill job + HoodieDeltaStreamer.Config cfgBackfillJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, + propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName())); + cfgBackfillJob.continuousMode = false; + HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tableBasePath).build(); + HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class); + cfgBackfillJob.checkpoint = commitMetadata.getMetadata(CHECKPOINT_KEY); + cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + HoodieDeltaStreamer backfillJob = new HoodieDeltaStreamer(cfgBackfillJob, jsc()); + + // re-init ingestion job to start sync service + HoodieDeltaStreamer ingestionJob2 = new HoodieDeltaStreamer(cfgIngestionJob, jsc()); + + // run ingestion & backfill in parallel, create 
conflict and fail one + runJobsInParallel(tableBasePath, tableType, totalRecords, ingestionJob2, + cfgIngestionJob, backfillJob, cfgBackfillJob, true); + + // create new ingestion & backfill job config to generate only INSERTS to avoid conflict + props = prepareMultiWriterProps(fs(), basePath, propsFilePath); + props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); + props.setProperty("hoodie.write.lock.filesystem.path", tableBasePath); + props.setProperty("hoodie.test.source.generate.inserts", "true"); + UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER); + cfgBackfillJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.INSERT, + propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TestIdentityTransformer.class.getName())); + cfgBackfillJob.continuousMode = false; + meta = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tableBasePath).build(); + timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + commitMetadata = HoodieCommitMetadata + .fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class); + cfgBackfillJob.checkpoint = commitMetadata.getMetadata(CHECKPOINT_KEY); + cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + + cfgIngestionJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, + propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TestIdentityTransformer.class.getName())); + cfgIngestionJob.continuousMode = true; + cfgIngestionJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + cfgIngestionJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + // re-init ingestion job + HoodieDeltaStreamer ingestionJob3 = new HoodieDeltaStreamer(cfgIngestionJob, jsc()); + // re-init backfill job + HoodieDeltaStreamer backfillJob2 = new HoodieDeltaStreamer(cfgBackfillJob, jsc()); + + // run ingestion & backfill in parallel, avoid conflict and succeed both + runJobsInParallel(tableBasePath, tableType, totalRecords, ingestionJob3, + cfgIngestionJob, backfillJob2, cfgBackfillJob, false); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class, names = {"COPY_ON_WRITE"}) + void testLatestCheckpointCarryOverWithMultipleWriters(HoodieTableType tableType) throws Exception { + // NOTE : Overriding the LockProvider to FileSystemBasedLockProviderTestClass since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts + setUpTestTable(tableType); + prepareInitialConfigs(fs(), basePath, "foo"); + // enable carrying forward latest checkpoint + TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath); + props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); + props.setProperty("hoodie.write.lock.filesystem.path", tableBasePath); + UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath); + + HoodieDeltaStreamer.Config cfgIngestionJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, + propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName())); + 
cfgIngestionJob.continuousMode = true; + cfgIngestionJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + cfgIngestionJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + + // create a backfill job with checkpoint from the first instant + HoodieDeltaStreamer.Config cfgBackfillJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, + propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName())); + cfgBackfillJob.continuousMode = false; + HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tableBasePath).build(); + HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + HoodieCommitMetadata commitMetadataForFirstInstant = HoodieCommitMetadata + .fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class); + + // get current checkpoint after preparing base dataset with some commits + HoodieCommitMetadata commitMetadataForLastInstant = HoodieCommitMetadata + .fromBytes(timeline.getInstantDetails(timeline.lastInstant().get()).get(), HoodieCommitMetadata.class); + String lastCheckpointBeforeParallelBackfill = commitMetadataForLastInstant.getMetadata(CHECKPOINT_KEY); + + // run the backfill job, enable overriding checkpoint from the latest commit + props = prepareMultiWriterProps(fs(), basePath, propsFilePath); + props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass"); + props.setProperty("hoodie.write.lock.filesystem.path", tableBasePath); + props.setProperty("hoodie.write.meta.key.prefixes", CHECKPOINT_KEY); + UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath); + + // reset checkpoint to first instant to simulate a random checkpoint for backfill job + // checkpoint will move from 00000 to 00001 for this backfill job + cfgBackfillJob.checkpoint = commitMetadataForFirstInstant.getMetadata(CHECKPOINT_KEY); + cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + HoodieDeltaStreamer backfillJob = new HoodieDeltaStreamer(cfgBackfillJob, jsc()); + backfillJob.sync(); + + // check if the checkpoint is carried over + timeline = meta.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); + commitMetadataForLastInstant = HoodieCommitMetadata + .fromBytes(timeline.getInstantDetails(timeline.lastInstant().get()).get(), HoodieCommitMetadata.class); + String lastCheckpointAfterParallelBackfill = commitMetadataForLastInstant.getMetadata(CHECKPOINT_KEY); + Assertions.assertEquals(lastCheckpointBeforeParallelBackfill, lastCheckpointAfterParallelBackfill); + } + + private static TypedProperties prepareMultiWriterProps(FileSystem fs, String basePath, String propsFilePath) throws IOException { + TypedProperties props = new TypedProperties(); + HoodieDeltaStreamerTestBase.populateCommonProps(props, basePath); + HoodieDeltaStreamerTestBase.populateCommonHiveProps(props); + + props.setProperty("include", "sql-transformer.properties"); + props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); + props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + 
props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); + props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/source.avsc"); + props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", basePath + "/target.avsc"); + + props.setProperty("include", "base.properties"); + props.setProperty("hoodie.write.concurrency.mode", "optimistic_concurrency_control"); + props.setProperty("hoodie.cleaner.policy.failed.writes", "LAZY"); + props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider"); + props.setProperty("hoodie.write.lock.hivemetastore.database", "testdb1"); + props.setProperty("hoodie.write.lock.hivemetastore.table", "table1"); + props.setProperty("hoodie.write.lock.zookeeper.url", "127.0.0.1"); + props.setProperty("hoodie.write.lock.zookeeper.port", "2828"); + props.setProperty("hoodie.write.lock.wait_time_ms", "1200000"); + props.setProperty("hoodie.write.lock.num_retries", "10"); + props.setProperty("hoodie.write.lock.zookeeper.lock_key", "test_table"); + props.setProperty("hoodie.write.lock.zookeeper.base_path", "/test"); + props.setProperty(INSERT_PARALLELISM_VALUE.key(), "4"); + props.setProperty(UPSERT_PARALLELISM_VALUE.key(), "4"); + props.setProperty(BULKINSERT_PARALLELISM_VALUE.key(), "4"); + props.setProperty(FINALIZE_WRITE_PARALLELISM_VALUE.key(), "4"); + props.setProperty(BULK_INSERT_SORT_MODE.key(), BulkInsertSortMode.NONE.name()); + + UtilitiesTestBase.Helpers.savePropsToDFS(props, fs, propsFilePath); + return props; + } + + private static HoodieDeltaStreamer.Config getDeltaStreamerConfig(String basePath, + String tableType, WriteOperationType op, String propsFilePath, List transformerClassNames) { + HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config(); + cfg.targetBasePath = basePath; + cfg.targetTableName = "hoodie_trips"; + cfg.tableType = tableType; + cfg.sourceClassName = TestDataSource.class.getName(); + cfg.transformerClassNames = transformerClassNames; + cfg.operation = op; + cfg.enableHiveSync = false; + cfg.sourceOrderingField = "timestamp"; + cfg.propsFilePath = propsFilePath; + cfg.sourceLimit = 1000; + cfg.schemaProviderClassName = defaultSchemaProviderClassName; + return cfg; + } + + /** + * Specifically used for {@link TestHoodieDeltaStreamerWithMultiWriter}. + * + * The fixture test tables have random records generated by + * {@link org.apache.hudi.common.testutils.HoodieTestDataGenerator} using + * {@link org.apache.hudi.common.testutils.HoodieTestDataGenerator#TRIP_EXAMPLE_SCHEMA}. + * + * The COW fixture test table has 3000 unique records in 7 commits. + * The MOR fixture test table has 3000 unique records in 9 deltacommits and 1 compaction commit. 
+ */ + private void setUpTestTable(HoodieTableType tableType) throws IOException { + basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString(); + propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER; + String fixtureName = String.format("fixtures/testUpsertsContinuousModeWithMultipleWriters.%s.zip", tableType.name()); + tableBasePath = prepareFixtureTable(Objects.requireNonNull(getClass() + .getClassLoader().getResource(fixtureName)), Paths.get(basePath)).toString(); + initDataGen(sqlContext(), tableBasePath + "/*/*.parquet", DEFAULT_PARTITION_NUM); + totalRecords = dataGeneratorMap.get(DEFAULT_PARTITION_NUM).getNumExistingKeys(TRIP_EXAMPLE_SCHEMA); + } + + private void runJobsInParallel(String tableBasePath, HoodieTableType tableType, int totalRecords, + HoodieDeltaStreamer ingestionJob, HoodieDeltaStreamer.Config cfgIngestionJob, HoodieDeltaStreamer backfillJob, + HoodieDeltaStreamer.Config cfgBackfillJob, boolean expectConflict) throws Exception { + ExecutorService service = Executors.newFixedThreadPool(2); + HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tableBasePath).build(); + HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + String lastSuccessfulCommit = timeline.lastInstant().get().getTimestamp(); + // Condition for parallel ingestion job + Function conditionForRegularIngestion = (r) -> { + if (tableType.equals(HoodieTableType.MERGE_ON_READ)) { + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommitsAfterCommit(3, lastSuccessfulCommit, tableBasePath, fs()); + } else { + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommitsAfterCommit(3, lastSuccessfulCommit, tableBasePath, fs()); + } + TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext()); + TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext()); + return true; + }; + + try { + Future regularIngestionJobFuture = service.submit(() -> { + try { + deltaStreamerTestRunner(ingestionJob, cfgIngestionJob, conditionForRegularIngestion); + } catch (Exception ex) { + throw new RuntimeException(ex); + } + }); + Future backfillJobFuture = service.submit(() -> { + try { + backfillJob.sync(); + } catch (Exception ex) { + throw new RuntimeException(ex); + } + }); + backfillJobFuture.get(); + regularIngestionJobFuture.get(); + if (expectConflict) { + Assertions.fail("Failed to handle concurrent writes"); + } + } catch (Exception e) { + /* + * Need to perform getMessage().contains since the exception coming + * from {@link org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.DeltaSyncService} gets wrapped many times into RuntimeExceptions. 
+ */ + if (expectConflict && e.getCause().getMessage().contains(ConcurrentModificationException.class.getName())) { + // expected ConcurrentModificationException since ingestion & backfill will have overlapping writes + } else { + throw e; + } + } + } + +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieMultiTableDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieMultiTableDeltaStreamer.java index 3f2e86e2438b1..8eb91d24687c1 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieMultiTableDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieMultiTableDeltaStreamer.java @@ -43,17 +43,17 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestHoodieMultiTableDeltaStreamer extends TestHoodieDeltaStreamerBase { +public class TestHoodieMultiTableDeltaStreamer extends HoodieDeltaStreamerTestBase { private static volatile Logger log = LogManager.getLogger(TestHoodieMultiTableDeltaStreamer.class); static class TestHelpers { - static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String configFolder, String sourceClassName, boolean enableHiveSync) { - return getConfig(fileName, configFolder, sourceClassName, enableHiveSync, true, "multi_table_dataset"); + static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String configFolder, String sourceClassName, boolean enableHiveSync, boolean enableMetaSync) { + return getConfig(fileName, configFolder, sourceClassName, enableHiveSync, enableMetaSync, true, "multi_table_dataset"); } - static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String configFolder, String sourceClassName, boolean enableHiveSync, + static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String configFolder, String sourceClassName, boolean enableHiveSync, boolean enableMetaSync, boolean setSchemaProvider, String basePathPrefix) { HoodieMultiTableDeltaStreamer.Config config = new HoodieMultiTableDeltaStreamer.Config(); config.configFolder = configFolder; @@ -67,23 +67,24 @@ static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String co config.schemaProviderClassName = FilebasedSchemaProvider.class.getName(); } config.enableHiveSync = enableHiveSync; + config.enableMetaSync = enableMetaSync; return config; } } @Test public void testInvalidHiveSyncProps() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_HIVE_SYNC_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), true); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_HIVE_SYNC_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), true, true); Exception e = assertThrows(HoodieException.class, () -> { new HoodieMultiTableDeltaStreamer(cfg, jsc); }, "Should fail when hive sync table not provided with enableHiveSync flag"); log.debug("Expected error when creating table execution objects", e); - assertTrue(e.getMessage().contains("Hive sync table field not provided!")); + assertTrue(e.getMessage().contains("Meta sync table field not provided!")); } @Test public void testInvalidPropsFilePath() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_FILE, dfsBasePath + "/config", TestDataSource.class.getName(), true); + 
HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_FILE, dfsBasePath + "/config", TestDataSource.class.getName(), true, true); Exception e = assertThrows(IllegalArgumentException.class, () -> { new HoodieMultiTableDeltaStreamer(cfg, jsc); }, "Should fail when invalid props file is provided"); @@ -93,7 +94,7 @@ public void testInvalidPropsFilePath() throws IOException { @Test public void testInvalidTableConfigFilePath() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_TABLE_CONFIG_FILE, dfsBasePath + "/config", TestDataSource.class.getName(), true); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_TABLE_CONFIG_FILE, dfsBasePath + "/config", TestDataSource.class.getName(), true, true); Exception e = assertThrows(IllegalArgumentException.class, () -> { new HoodieMultiTableDeltaStreamer(cfg, jsc); }, "Should fail when invalid table config props file path is provided"); @@ -103,7 +104,7 @@ public void testInvalidTableConfigFilePath() throws IOException { @Test public void testCustomConfigProps() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false, false); HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); TableExecutionContext executionContext = streamer.getTableExecutionContexts().get(1); assertEquals(2, streamer.getTableExecutionContexts().size()); @@ -119,7 +120,7 @@ public void testCustomConfigProps() throws IOException { @Disabled public void testInvalidIngestionProps() { Exception e = assertThrows(Exception.class, () -> { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), true); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), true, true); new HoodieMultiTableDeltaStreamer(cfg, jsc); }, "Creation of execution object should fail without kafka topic"); log.debug("Creation of execution object failed with error: " + e.getMessage(), e); @@ -138,7 +139,7 @@ public void testMultiTableExecutionWithKafkaSource() throws IOException { testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 5, HoodieTestDataGenerator.TRIP_SCHEMA))); testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA))); - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", JsonKafkaSource.class.getName(), false); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", JsonKafkaSource.class.getName(), false, false); HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); List executionContexts = streamer.getTableExecutionContexts(); TypedProperties properties = executionContexts.get(1).getProperties(); @@ -187,7 +188,7 @@ public void testMultiTableExecutionWithParquetSource() throws IOException { // add only common props. 
later we can add per table props String parquetPropsFile = populateCommonPropsAndWriteToFile(); - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(parquetPropsFile, dfsBasePath + "/config", ParquetDFSSource.class.getName(), false, + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(parquetPropsFile, dfsBasePath + "/config", ParquetDFSSource.class.getName(), false, false, false, "multi_table_parquet"); HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); @@ -218,7 +219,7 @@ public void testMultiTableExecutionWithParquetSource() throws IOException { @Test public void testTableLevelProperties() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false, false); HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); List tableExecutionContexts = streamer.getTableExecutionContexts(); tableExecutionContexts.forEach(tableExecutionContext -> { @@ -236,7 +237,7 @@ public void testTableLevelProperties() throws IOException { private String populateCommonPropsAndWriteToFile() throws IOException { TypedProperties commonProps = new TypedProperties(); - populateCommonProps(commonProps); + populateCommonProps(commonProps, dfsBasePath); UtilitiesTestBase.Helpers.savePropsToDFS(commonProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_PARQUET); return PROPS_FILENAME_TEST_PARQUET; } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java index 15f702a8d4034..f192ede73a159 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java @@ -71,8 +71,7 @@ public void testEmptySnapshotCopy() throws IOException { // Do the snapshot HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); copier.snapshot(jsc(), basePath, outputPath, true, - HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue()); + HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS); // Nothing changed; we just bail out assertEquals(fs.listStatus(new Path(basePath)).length, 1); @@ -125,8 +124,7 @@ public void testSnapshotCopy() throws Exception { // Do a snapshot copy HoodieSnapshotCopier copier = new HoodieSnapshotCopier(); - copier.snapshot(jsc(), basePath, outputPath, false, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, - HoodieMetadataConfig.VALIDATE_ENABLE.defaultValue()); + copier.snapshot(jsc(), basePath, outputPath, false, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS); // Check results assertTrue(fs.exists(new Path(outputPath + "/2016/05/01/" + file11.getName()))); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotExporter.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotExporter.java index c977b79cb2c25..541da0a554fa4 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotExporter.java +++ 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotExporter.java @@ -203,6 +203,8 @@ public void testExportDatasetWithNoCommit() throws IOException { public void testExportDatasetWithNoPartition() throws IOException { // delete all source data lfs.delete(new Path(sourcePath + "/" + PARTITION_PATH), true); + // delete hudi metadata table too. + lfs.delete(new Path(cfg.sourceBasePath + "/" + ".hoodie/metadata"), true); // export final Throwable thrown = assertThrows(HoodieSnapshotExporterException.class, () -> { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestJdbcbasedSchemaProvider.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestJdbcbasedSchemaProvider.java index 7dd8af689acf1..938f71c10318f 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestJdbcbasedSchemaProvider.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestJdbcbasedSchemaProvider.java @@ -32,7 +32,6 @@ import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import java.io.IOException; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; @@ -72,11 +71,11 @@ public void testJdbcbasedSchemaProvider() throws Exception { * Initialize the H2 database and obtain a connection, then create a table as a test. * Based on the characteristics of the H2 in-memory database, we do not need to display the initialized database. * @throws SQLException - * @throws IOException */ - private void initH2Database() throws SQLException, IOException { - Connection conn = DriverManager.getConnection("jdbc:h2:mem:test_mem", "sa", ""); - PreparedStatement ps = conn.prepareStatement(UtilitiesTestBase.Helpers.readFile("delta-streamer-config/triprec.sql")); - ps.executeUpdate(); + private void initH2Database() throws SQLException { + try (Connection conn = DriverManager.getConnection("jdbc:h2:mem:test_mem", "sa", "")) { + PreparedStatement ps = conn.prepareStatement(UtilitiesTestBase.Helpers.readFile("delta-streamer-config/triprec.sql")); + ps.executeUpdate(); + } } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSource.java index da11035c90d20..2ed4c42582c3c 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSource.java @@ -151,6 +151,28 @@ public void testJsonKafkaSource() { assertEquals(Option.empty(), fetch4AsRows.getBatch()); } + // test whether empty messages can be filtered + @Test + public void testJsonKafkaSourceFilterNullMsg() { + // topic setup. + testUtils.createTopic(TEST_TOPIC_NAME, 2); + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); + TypedProperties props = createPropsForJsonSource(null, "earliest"); + + Source jsonSource = new JsonKafkaSource(props, jsc, sparkSession, schemaProvider, metrics); + SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource); + + // 1. 
Extract without any checkpoint => get all the data, respecting sourceLimit + assertEquals(Option.empty(), kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch()); + // Send 1000 non-null messages to Kafka + testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 1000))); + // Send 100 null messages to Kafka + testUtils.sendMessages(TEST_TOPIC_NAME,new String[100]); + InputBatch> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE); + // Verify that messages with null values are filtered + assertEquals(1000, fetch1.getBatch().get().count()); + } + // test case with kafka offset reset strategy @Test public void testJsonKafkaSourceResetStrategy() { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java index 8bff47522b13c..bb00d2fef7324 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java @@ -28,6 +28,7 @@ import org.apache.hudi.common.testutils.RawTripTestPayload; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService; +import org.apache.hudi.common.util.AvroOrcUtils; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; @@ -57,6 +58,11 @@ import org.apache.hive.service.server.HiveServer2; import org.apache.log4j.Level; import org.apache.log4j.Logger; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.storage.ql.exec.vector.ColumnVector; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.hadoop.ParquetFileWriter.Mode; import org.apache.parquet.hadoop.ParquetWriter; @@ -314,6 +320,27 @@ public static void saveParquetToDFS(List records, Path targetFile } } + public static void saveORCToDFS(List records, Path targetFile) throws IOException { + saveORCToDFS(records, targetFile, HoodieTestDataGenerator.ORC_SCHEMA); + } + + public static void saveORCToDFS(List records, Path targetFile, TypeDescription schema) throws IOException { + OrcFile.WriterOptions options = OrcFile.writerOptions(HoodieTestUtils.getDefaultHadoopConf()).setSchema(schema); + try (Writer writer = OrcFile.createWriter(targetFile, options)) { + VectorizedRowBatch batch = schema.createRowBatch(); + for (GenericRecord record : records) { + addAvroRecord(batch, record, schema); + batch.size++; + if (batch.size % records.size() == 0 || batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + batch.size = 0; + } + } + writer.addRowBatch(batch); + } + } + public static TypedProperties setupSchemaOnDFS() throws IOException { return setupSchemaOnDFS("delta-streamer-config", "source.avsc"); } @@ -364,5 +391,21 @@ public static String toJsonString(HoodieRecord hr) { public static String[] jsonifyRecords(List records) { return records.stream().map(Helpers::toJsonString).toArray(String[]::new); } + + private static void addAvroRecord( + VectorizedRowBatch batch, + GenericRecord record, + TypeDescription orcSchema + ) { + for (int c = 0; c < batch.numCols; c++) { + ColumnVector colVector = batch.cols[c]; + final 
String thisField = orcSchema.getFieldNames().get(c); + final TypeDescription type = orcSchema.getChildren().get(c); + + Object fieldValue = record.get(thisField); + Schema.Field avroField = record.getSchema().getField(thisField); + AvroOrcUtils.addToVector(type, colVector, avroField.schema(), fieldValue, batch.size); + } + } } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java index 524591dd7bca5..5186179c95877 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java @@ -19,6 +19,7 @@ package org.apache.hudi.utilities.testutils.sources; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.RawTripTestPayload; @@ -32,12 +33,18 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.SparkSession; import java.io.File; import java.io.IOException; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.stream.Stream; public abstract class AbstractBaseTestSource extends AvroSource { @@ -47,7 +54,7 @@ public abstract class AbstractBaseTestSource extends AvroSource { public static final int DEFAULT_PARTITION_NUM = 0; // Static instance, helps with reuse across a test. 
- protected static transient Map dataGeneratorMap = new HashMap<>(); + public static transient Map dataGeneratorMap = new HashMap<>(); public static void initDataGen() { dataGeneratorMap.putIfAbsent(DEFAULT_PARTITION_NUM, @@ -68,6 +75,23 @@ public static void initDataGen(TypedProperties props, int partition) { } } + public static void initDataGen(SQLContext sqlContext, String globParquetPath, int partition) { + List rows = sqlContext.read().format("hudi").load(globParquetPath) + .select("_hoodie_record_key", "_hoodie_partition_path") + .collectAsList(); + Map keyPartitionMap = IntStream + .range(0, rows.size()).boxed() + .collect(Collectors.toMap(Function.identity(), i -> { + Row r = rows.get(i); + HoodieTestDataGenerator.KeyPartition kp = new HoodieTestDataGenerator.KeyPartition(); + kp.key = new HoodieKey(r.getString(0), r.getString(1)); + kp.partitionPath = r.getString(1); + return kp; + })); + dataGeneratorMap.put(partition, + new HoodieTestDataGenerator(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, keyPartitionMap)); + } + public static void resetDataGen() { for (HoodieTestDataGenerator dataGenerator : dataGeneratorMap.values()) { dataGenerator.close(); diff --git a/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.COPY_ON_WRITE.zip b/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.COPY_ON_WRITE.zip new file mode 100644 index 0000000000000..48bf278bd6c6f Binary files /dev/null and b/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.COPY_ON_WRITE.zip differ diff --git a/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.MERGE_ON_READ.zip b/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.MERGE_ON_READ.zip new file mode 100644 index 0000000000000..657f83c2d0a59 Binary files /dev/null and b/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.MERGE_ON_READ.zip differ diff --git a/packaging/hudi-flink-bundle/pom.xml b/packaging/hudi-flink-bundle/pom.xml index fea8849278d4d..868912eea4ff7 100644 --- a/packaging/hudi-flink-bundle/pom.xml +++ b/packaging/hudi-flink-bundle/pom.xml @@ -110,6 +110,7 @@ com.twitter:bijection-core_${scala.binary.version} io.dropwizard.metrics:metrics-core io.dropwizard.metrics:metrics-graphite + io.dropwizard.metrics:metrics-jmx io.prometheus:simpleclient io.prometheus:simpleclient_httpserver io.prometheus:simpleclient_dropwizard @@ -143,10 +144,18 @@ org.apache.flink:flink-sql-connector-hive-2.3.6_${scala.binary.version} org.apache.hbase:hbase-common + org.apache.hbase:hbase-client + org.apache.hbase:hbase-server + org.apache.hbase:hbase-protocol + org.apache.htrace:htrace-core commons-codec:commons-codec + + javax.servlet. + ${flink.bundle.shade.prefix}javax.servlet. + org.apache.avro. ${flink.bundle.shade.prefix}org.apache.avro. @@ -191,6 +200,10 @@ org.apache.hadoop.hive.service. ${flink.bundle.shade.prefix}org.apache.hadoop.hive.service. + + org.apache.hadoop.hive.ql.metadata. + ${flink.bundle.shade.prefix}org.apache.hadoop.hive.ql.metadata. + com.codahale.metrics. ${flink.bundle.shade.prefix}com.codahale.metrics. 
@@ -593,6 +606,53 @@ + + org.apache.hbase + hbase-server + ${hbase.version} + compile + + + guava + com.google.guava + + + org.apache.hbase + hbase-common + + + javax.servlet + * + + + org.codehaus.jackson + * + + + org.mortbay.jetty + * + + + tomcat + * + + + + + org.apache.hbase + hbase-client + ${hbase.version} + + + org.apache.hbase + hbase-protocol + ${hbase.version} + + + org.apache.htrace + htrace-core + ${htrace.version} + @@ -634,15 +694,5 @@ - - include-flink-sql-connector-hive - - - org.apache.flink - flink-sql-connector-hive-2.3.6_${scala.binary.version} - ${flink.version} - - - diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index 152a127cbf1ce..76ede4738956b 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -168,12 +168,22 @@ compile + + org.apache.hbase + hbase-common + ${hbase.version} + + org.apache.hbase hbase-server ${hbase.version} compile + + org.apache.hbase + hbase-common + javax.servlet * diff --git a/packaging/hudi-integ-test-bundle/pom.xml b/packaging/hudi-integ-test-bundle/pom.xml index 3207fb8943575..7518e7b44ef0f 100644 --- a/packaging/hudi-integ-test-bundle/pom.xml +++ b/packaging/hudi-integ-test-bundle/pom.xml @@ -353,6 +353,7 @@ org.apache.hadoop hadoop-hdfs tests + test org.apache.hadoop diff --git a/packaging/hudi-kafka-connect-bundle/pom.xml b/packaging/hudi-kafka-connect-bundle/pom.xml index 14bc4e4c87b55..debbfa7850648 100644 --- a/packaging/hudi-kafka-connect-bundle/pom.xml +++ b/packaging/hudi-kafka-connect-bundle/pom.xml @@ -74,6 +74,8 @@ commons-httpclient:commons-httpclient org.apache.htrace:htrace-core org.jamon:jamon-runtime + org.slf4j:* + log4j:log4j jdk.tools:jdk.tools junit:junit @@ -155,10 +157,19 @@ compile + + + com.google.protobuf + protobuf-java + ${proto.version} + compile + + org.apache.hadoop hadoop-common + ${hadoop.version} compile diff --git a/packaging/hudi-spark-bundle/pom.xml b/packaging/hudi-spark-bundle/pom.xml index fce529dcc1f6b..d55b39493dca2 100644 --- a/packaging/hudi-spark-bundle/pom.xml +++ b/packaging/hudi-spark-bundle/pom.xml @@ -91,6 +91,7 @@ com.twitter:bijection-core_${scala.binary.version} io.dropwizard.metrics:metrics-core io.dropwizard.metrics:metrics-graphite + io.dropwizard.metrics:metrics-jmx io.prometheus:simpleclient io.prometheus:simpleclient_httpserver io.prometheus:simpleclient_dropwizard @@ -320,6 +321,10 @@ ${hbase.version} compile + + org.apache.hbase + hbase-common + javax.servlet * diff --git a/packaging/hudi-utilities-bundle/pom.xml b/packaging/hudi-utilities-bundle/pom.xml index 148c20514e7ba..63b3b7da8a4bd 100644 --- a/packaging/hudi-utilities-bundle/pom.xml +++ b/packaging/hudi-utilities-bundle/pom.xml @@ -36,6 +36,28 @@ org.apache.rat apache-rat-plugin + + false + 0 + + NOTICE + DISCLAIMER + **/.* + **/*.json + **/*.log + **/*.sqltemplate + **/compose_env + **/*NOTICE* + **/*LICENSE* + **/dependency-reduced-pom.xml + **/test/resources/*.data + **/test/resources/*.commit + **/target/** + **/generated-sources/** + .github/** + **/*.sql + + org.apache.maven.plugins @@ -317,6 +339,10 @@ ${hbase.version} compile + + org.apache.hbase + hbase-common + javax.servlet * diff --git a/pom.xml b/pom.xml index a1beac06a32c2..888d8d32c6ed1 100644 --- a/pom.xml +++ b/pom.xml @@ -99,7 +99,7 @@ 1.7.0-M1 3.3.3 1.2.17 - 1.7.15 + 1.7.30 2.9.9 2.7.3 org.apache.hive @@ -114,7 +114,7 @@ ${spark2bundle.version} 1.13.1 2.4.4 - 3.0.0 + 3.1.2 3 hudi-spark2 @@ -156,6 +156,8 @@ 2.7.1 4.7 1.12.22 + 3.17.3 + 3.1.0 @@ -781,6 
+783,7 @@ org.apache.hadoop hadoop-hdfs tests + test ${hadoop.version} @@ -1061,10 +1064,6 @@ confluent https://packages.confluent.io/maven/ - - pentaho.org - https://public.nexus.pentaho.org/repository/proxy-public-3rd-party-release/ - @@ -1473,6 +1472,7 @@ ${scala12.version} 2.12 hudi-spark3 + 3.1.0 2.4.1 ${fasterxml.spark3.version} ${fasterxml.spark3.version} @@ -1488,6 +1488,16 @@ + + spark3.0.x + + + 3.0.0 + ${spark3.version} + 3.0.1 + + + skipShadeSources diff --git a/rfc/README.md b/rfc/README.md new file mode 100644 index 0000000000000..5ef97300fcc35 --- /dev/null +++ b/rfc/README.md @@ -0,0 +1,62 @@ + +# RFCs + - The RFC process is documented on our [site](https://hudi.apache.org/contribute/rfc-process). Please familiarize yourself with it, before working a new RFC. + - Status can be one of these values: `UNDER REVIEW` (or) `IN PROGRESS` (or) `ABANDONED` (or) `COMPLETED`. + +The list of all RFCs can be found here. + +> Older RFC content is still [here](https://cwiki.apache.org/confluence/display/HUDI/RFC+Process). + +| RFC Number | Title | Status | +| ---| ---| --- | +| 1 | [CSV Source Support for Delta Streamer](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+01+%3A+CSV+Source+Support+for+Delta+Streamer) | `COMPLETED` | +| 2 | [ORC Storage in Hudi](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=113708439) | `IN PROGRESS` | +| 3 | [Timeline Service with Incremental File System View Syncing](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=113708965) | `COMPLETED` | +| 4 | [Faster Hive incremental pull queries](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=115513622) | `COMPLETED` | +| 5 | [HUI (Hudi WebUI)](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=130027233) | `ABANDONED` | +| 6 | [Add indexing support to the log file](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+06+%3A+Add+indexing+support+to+the+log+file) | `ABANDONED` | +| 7 | [Point in time Time-Travel queries on Hudi table](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+07+%3A+Point+in+time+Time-Travel+queries+on+Hudi+table) | `COMPLETED` | +| 8 | [Record level indexing mechanisms for Hudi datasets](https://cwiki.apache.org/confluence/display/HUDI/RFC-08++Record+level+indexing+mechanisms+for+Hudi+datasets) | `UNDER REVIEW` | +| 9 | [Hudi Dataset Snapshot Exporter](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+09+%3A+Hudi+Dataset+Snapshot+Exporter) | `COMPLETED` | +| 10 | [Restructuring and auto-generation of docs](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+10+%3A+Restructuring+and+auto-generation+of+docs) | `COMPLETED` | +| 11 | [Refactor of the configuration framework of hudi project](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+11+%3A+Refactor+of+the+configuration+framework+of+hudi+project) | `ABANDONED` | +| 12 | [Efficient Migration of Large Parquet Tables to Apache Hudi](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+12+%3A+Efficient+Migration+of+Large+Parquet+Tables+to+Apache+Hudi) | `COMPLETED` | +| 13 | [Integrate Hudi with Flink](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=141724520) | `COMPLETED` | +| 14 | [JDBC incremental puller](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+14+%3A+JDBC+incremental+puller) | `COMPLETED` | +| 15 | [HUDI File Listing Improvements](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+15%3A+HUDI+File+Listing+Improvements) | `COMPLETED` | +| 16 | [Abstraction for HoodieInputFormat and 
RecordReader](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+16+Abstraction+for+HoodieInputFormat+and+RecordReader) | `COMPLETED` | +| 17 | [Abstract common meta sync module support multiple meta service](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+17+Abstract+common+meta+sync+module+support+multiple+meta+service) | `COMPLETED` | +| 18 | [Insert Overwrite API](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+18+Insert+Overwrite+API) | `COMPLETED` | +| 19 | [Clustering data for freshness and query performance](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance) | `COMPLETED` | +| 20 | [handle failed records](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+20+%3A+handle+failed+records) | `IN PROGRESS` | +| 21 | [Allow HoodieRecordKey to be Virtual](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+21+%3A+Allow+HoodieRecordKey+to+be+Virtual) | `COMPLETED` | +| 22 | [Snapshot Isolation using Optimistic Concurrency Control for multi-writers](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+22+%3A+Snapshot+Isolation+using+Optimistic+Concurrency+Control+for+multi-writers) | `COMPLETED` | +| 23 | [Hudi Observability metrics collection](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+23+%3A+Hudi+Observability+metrics+collection) | `UNDER REVIEW` | +| 24 | [Hoodie Flink Writer Proposal](https://cwiki.apache.org/confluence/display/HUDI/RFC-24%3A+Hoodie+Flink+Writer+Proposal) | `UNDER REVIEW` | +| 25 | [Spark SQL Extension For Hudi](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+25%3A+Spark+SQL+Extension+For+Hudi) | `COMPLETED` | +| 26 | [Optimization For Hudi Table Query](https://cwiki.apache.org/confluence/display/HUDI/RFC-26+Optimization+For+Hudi+Table+Query) | `IN PROGRESS` | +| 27 | [Data skipping index to improve query performance](https://cwiki.apache.org/confluence/display/HUDI/RFC-27+Data+skipping+index+to+improve+query+performance) | `IN PROGRESS` | +| 28 | [Support Z-order curve](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=181307144) | `IN PROGRESS` | +| 29 | [Hash Index](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+29%3A+Hash+Index) | `IN PROGRESS` | +| 30 | [Batch operation](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+30%3A+Batch+operation) | `UNDER REVIEW` | +| 31 | [Hive integration Improvment](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+31%3A+Hive+integration+Improvment) | `UNDER REVIEW` | +| 32 | [Kafka Connect Sink for Hudi](https://cwiki.apache.org/confluence/display/HUDI/RFC-32+Kafka+Connect+Sink+for+Hudi) | `IN PROGRESS` | +| 33 | [Hudi supports more comprehensive Schema Evolution](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+33++Hudi+supports+more+comprehensive+Schema+Evolution) | `IN PROGRESS` | +| 34 | [Hudi BigQuery Integration (WIP)](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=188745980) | `UNDER REVIEW` | +| 35 | [Make Flink MOR table writing streaming friendly](https://cwiki.apache.org/confluence/display/HUDI/RFC-35%3A+Make+Flink+MOR+table+writing+streaming+friendly) | `UNDER REVIEW` | +| 36 | [HUDI Metastore Server](https://cwiki.apache.org/confluence/display/HUDI/%5BWIP%5D+RFC-36%3A+HUDI+Metastore+Server) | `UNDER REVIEW` | \ No newline at end of file diff --git a/rfc/template.md b/rfc/template.md new file mode 100644 index 0000000000000..fef8452407bf3 --- /dev/null +++ b/rfc/template.md @@ -0,0 +1,57 @@ + +# RFC-[number]: [Title] + + + +## Proposers + +- @ +- @ + +## 
+## Approvers
+ - @
+ - @
+
+## Status
+
+JIRA:
+
+> Please keep the status updated in `rfc/README.md`.
+
+## Abstract
+
+Describe the problem you are trying to solve and briefly explain why it is needed.
+
+## Background
+Introduce any background context that is relevant or necessary to understand the feature and design choices.
+
+## Implementation
+Describe the new thing you want to do in appropriate detail and how it fits into the project architecture.
+Provide a detailed description of how you intend to implement this feature. This may be fairly extensive and have large subsections of its own.
+Or it may be a few sentences. Use judgement based on the scope of the change.
+
+## Rollout/Adoption Plan
+
+ - What impact (if any) will there be on existing users?
+ - If we are changing behavior, how will we phase out the older behavior?
+ - If we need special migration tools, describe them here.
+ - When will we remove the existing behavior?
+
+## Test Plan
+
+Describe in a few sentences how the RFC will be tested. How will we know that the implementation works as expected? How will we know nothing broke?
\ No newline at end of file
diff --git a/scripts/dependency.sh b/scripts/dependency.sh
new file mode 100755
index 0000000000000..29eed07e9cfd2
--- /dev/null
+++ b/scripts/dependency.sh
@@ -0,0 +1,142 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -eou pipefail
+#set -x
+export LC_ALL=C
+
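+# This script maintains the per-bundle dependency lists checked in under dependencies/:
+# it derives the bundle module names from the `mvn dependency:tree` output, then either
+# regenerates dependencies/<bundle>.txt for each bundle with the given Maven profiles (-p),
+# or rebuilds each list and fails if it differs from the committed file (-c).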
+PWD=$(cd "$(dirname "$0")"/.. || exit; pwd)
+
+function generate_dependencies() {
+  mvn --also-make dependency:tree -P $PROFILE | \
+    grep maven-dependency-plugin | \
+    grep bundle | \
+    awk '{
+      print $(NF-1);
+    }' | \
+  while read line; do
+    FILE_NAME="${PWD}"/dependencies/"$line".txt
+    build_classpath "$line" "-P $PROFILE" "$FILE_NAME"
+  done
+}
+
+function build_classpath() {
+  cat >"$3"<> "$3"
+}
+
+function check_diff() {
+  mvn --also-make dependency:tree -P $PROFILE | \
+    grep maven-dependency-plugin | \
+    grep bundle | \
+    awk '{
+      print $(NF-1);
+    }' | \
+  while read line; do
+    FILE_NAME="${PWD}"/dependencies/"$line".txt
+    BACKUP_FILE_NAME="$FILE_NAME".bkp
+    mv "$FILE_NAME" "$BACKUP_FILE_NAME"
+    build_classpath "$line" "-P $PROFILE" "$FILE_NAME"
+    set +e
+    the_diff=$(diff "$FILE_NAME" "$BACKUP_FILE_NAME")
+    set -e
+    rm -rf "$BACKUP_FILE_NAME"
+    if [[ -n $the_diff ]]; then
+      echo "Dependency List Change Detected [$line]: "
+      echo "${the_diff}"
+      echo "To update the dependency file, refer to the usage:"
+      printUsage
+      exit 1
+    fi
+  done
+}
+
+function printUsage() {
+  echo "Usage: $(basename "${0}") -p <profiles> [-c]" 2>&1
+  echo '   -c [OPTIONAL] to check the dependencies diff'
+  echo '   -p [MUST] to generate new dependencyList file for all bundle modules with the given profile list'
+}
+
+if [[ ${#} -eq 0 ]]; then
+  printUsage
+fi
+
+PROFILE=''
+CHECK_DIFF='false'
+
+while getopts "cp:" arg; do
+  case "${arg}" in
+    c)
+      CHECK_DIFF="true"
+      ;;
+    p)
+      PROFILE=$OPTARG
+      ;;
+    ?)
+      printUsage
+      ;;
+  esac
+done
+
+shift "$(( OPTIND - 1 ))"
+
+# -p is a required option
+if [ -z "$PROFILE" ]; then
+  echo 'Missing -p argument' >&2
+  exit 1
+fi
+
+if [ "$CHECK_DIFF" == "true" ]; then
+  check_diff
+else
+  generate_dependencies
+fi
\ No newline at end of file
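For illustration, the script would typically be invoked from the repository root as sketched below; the profile list is hypothetical and has to match the bundle-related profiles defined in the project's pom.xml:

    # regenerate dependencies/<bundle>.txt for the selected profiles
    ./scripts/dependency.sh -p "scala-2.11,flink-bundle-shade-hive2"

    # verify the committed lists; exits non-zero if any bundle's dependencies drifted
    ./scripts/dependency.sh -p "scala-2.11,flink-bundle-shade-hive2" -c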