diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDropPartitionsTool.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDropPartitionsTool.java new file mode 100644 index 0000000000000..50fdf36c81701 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDropPartitionsTool.java @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hudi.DataSourceUtils; +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.hive.HiveSyncTool; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.table.HoodieSparkTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + +import scala.Tuple2; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * A tool with spark-submit to drop Hudi table partitions. + * + *
+ * You can dry run this tool with the following command to list and print the table partitions and corresponding data files that would be deleted.
+ * ```
+ * spark-submit \
+ * --class org.apache.hudi.utilities.HoodieDropPartitionsTool \
+ * --packages org.apache.spark:spark-avro_2.11:2.4.4 \
+ * --master local[*] \
+ * --driver-memory 1g \
+ * --executor-memory 1g \
+ * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
+ * --base-path basePath \
+ * --table-name tableName \
+ * --mode dry_run \
+ * --partitions partition1,partition2
+ * ```
+ *
+ * You can delete the table partitions with '--mode delete'.
+ *
+ * - DELETE ("delete"): the tool will mask/tombstone the given partitions and their corresponding data files, and let the cleaner delete these files later.
+ * - You can also set --sync-hive-meta to sync the dropped partitions to the Hive metastore (see the hive sync example after the delete command below).
+ *
+ * Example command:
+ * ```
+ * spark-submit \
+ * --class org.apache.hudi.utilities.HoodieDropPartitionsTool \
+ * --packages org.apache.spark:spark-avro_2.11:2.4.4 \
+ * --master local[*] \
+ * --driver-memory 1g \
+ * --executor-memory 1g \
+ * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
+ * --base-path basePath \
+ * --table-name tableName \
+ * --mode delete \
+ * --partitions partition1,partition2
+ * ```
+ *
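+ * To also sync the dropped partitions to Hive in the same run, append the hive related options to the delete command above; the database, table and partition field values below are illustrative only:
+ * ```
+ * --sync-hive-meta \
+ * --hive-database default \
+ * --hive-table-name tableName \
+ * --hive-partition-field partition
+ * ```
+ *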
+ * You can also use --help to see the other available configs.
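+ * Extra write client configs can be supplied through a properties file (--props) or inline with repeated --hoodie-conf options; the key and value below are illustrative only:
+ * ```
+ * --hoodie-conf hoodie.datasource.write.partitionpath.field=partition
+ * ```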
+ */
+public class HoodieDropPartitionsTool implements Serializable {
+
+ private static final Logger LOG = LogManager.getLogger(HoodieDropPartitionsTool.class);
+ // Spark context
+ private final transient JavaSparkContext jsc;
+ // config
+ private final Config cfg;
+ // Properties with source, hoodie client, key generator etc.
+ private TypedProperties props;
+
+ private final HoodieTableMetaClient metaClient;
+
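+ /**
+ * Instantiates the tool with the given Spark context and configuration. Extra client properties are loaded
+ * from the --props file when provided, otherwise from the inline --hoodie-conf overrides.
+ *
+ * @param jsc {@link JavaSparkContext} instance.
+ * @param cfg {@link Config} instance.
+ */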
+ public HoodieDropPartitionsTool(JavaSparkContext jsc, Config cfg) {
+ this.jsc = jsc;
+ this.cfg = cfg;
+
+ this.props = cfg.propsFilePath == null
+ ? UtilHelpers.buildProperties(cfg.configs)
+ : readConfigFromFileSystem(jsc, cfg);
+ this.metaClient = HoodieTableMetaClient.builder()
+ .setConf(jsc.hadoopConfiguration()).setBasePath(cfg.basePath)
+ .setLoadActiveTimelineOnLoad(true)
+ .build();
+ }
+
+ /**
+ * Reads config from the file system.
+ *
+ * @param jsc {@link JavaSparkContext} instance.
+ * @param cfg {@link Config} instance.
+ * @return the {@link TypedProperties} instance.
+ */
+ private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
+ return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs)
+ .getProps(true);
+ }
+
+ public enum Mode {
+ // Mask/tombstone the given partitions and their corresponding data files, and let the cleaner delete these files later.
+ DELETE,
+ // Dry run: only look up and print the table partitions and corresponding data files that would be deleted.
+ DRY_RUN
+ }
+
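+ /**
+ * Command line configs for this tool, parsed with {@link JCommander}.
+ */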
+ public static class Config implements Serializable {
+ @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true)
+ public String basePath = null;
+ @Parameter(names = {"--mode", "-m"}, description = "Set job mode: "
+ + "Set \"delete\" means mask/tombstone these partitions and corresponding data files table partitions and let cleaner delete these files later;"
+ + "Set \"dry_run\" means only looking for the table partitions will be deleted and corresponding data files.", required = true)
+ public String runningMode = null;
+ @Parameter(names = {"--table-name", "-tn"}, description = "Table name", required = true)
+ public String tableName = null;
+ @Parameter(names = {"--partitions", "-p"}, description = "Comma separated list of partitions to delete.", required = true)
+ public String partitions = null;
+ @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for hoodie insert/upsert/delete", required = false)
+ public int parallelism = 1500;
+ @Parameter(names = {"--instant-time", "-it"}, description = "instant time for delete table partitions operation.", required = false)
+ public String instantTime = null;
+ @Parameter(names = {"--sync-hive-meta", "-sync"}, description = "Sync information to HMS.", required = false)
+ public boolean syncToHive = false;
+ @Parameter(names = {"--hive-database", "-db"}, description = "Database to sync to.", required = false)
+ public String hiveDataBase = null;
+ @Parameter(names = {"--hive-table-name"}, description = "Table to sync to.", required = false)
+ public String hiveTableName = null;
+ @Parameter(names = {"--hive-user-name", "-user"}, description = "hive user name to use.", required = false)
+ public String hiveUserName = "hive";
+ @Parameter(names = {"--hive-pass-word", "-pass"}, description = "hive password to use.", required = false)
+ public String hivePassWord = "hive";
+ @Parameter(names = {"--hive-jdbc-url", "-jdbc"}, description = "hive url to use.", required = false)
+ public String hiveURL = "jdbc:hive2://localhost:10000";
+ @Parameter(names = {"--hive-partition-field"}, description = "Comma separated list of field in the hive table to use for determining hive partition columns.", required = false)
+ public String hivePartitionsField = "";
+ @Parameter(names = {"--hive-sync-use-jdbc"}, description = "Use JDBC when hive synchronization.", required = false)
+ public boolean hiveUseJdbc = true;
+ @Parameter(names = {"--hive-metastore-uris"}, description = "hive meta store uris to use.", required = false)
+ public String hiveHMSUris = null;
+ @Parameter(names = {"--hive-sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.", required = false)
+ public String hiveSyncMode = "hms";
+ @Parameter(names = {"--hive-sync-ignore-exception"}, description = "Ignore hive sync exception.", required = false)
+ public boolean hiveSyncIgnoreException = false;
+ @Parameter(names = {"--hive-partition-value-extractor-class"}, description = "Class which implements PartitionValueExtractor to extract the partition values,"
+ + " default 'SlashEncodedDayPartitionValueExtractor'.", required = false)
+ public String partitionValueExtractorClass = "org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor";
+ @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false)
+ public String sparkMaster = null;
+ @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false)
+ public String sparkMemory = "1g";
+ @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
+ + "hoodie client for deleting table partitions")
+ public String propsFilePath = null;
+ @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
+ + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated",
+ splitter = IdentitySplitter.class)
+ public List