diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDropPartitionsTool.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDropPartitionsTool.java
deleted file mode 100644
index 50fdf36c81701..0000000000000
--- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDropPartitionsTool.java
+++ /dev/null
@@ -1,396 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.utilities;
-
-import com.beust.jcommander.JCommander;
-import com.beust.jcommander.Parameter;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hudi.DataSourceUtils;
-import org.apache.hudi.DataSourceWriteOptions;
-import org.apache.hudi.client.SparkRDDWriteClient;
-import org.apache.hudi.common.config.TypedProperties;
-import org.apache.hudi.common.fs.FSUtils;
-import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.table.HoodieTableMetaClient;
-import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
-import org.apache.hudi.common.table.timeline.HoodieTimeline;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.StringUtils;
-import org.apache.hudi.common.util.ValidationUtils;
-import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.exception.HoodieException;
-import org.apache.hudi.hive.HiveSyncConfig;
-import org.apache.hudi.hive.HiveSyncTool;
-import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
-import org.apache.hudi.table.HoodieSparkTable;
-
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-
-import scala.Tuple2;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-import java.util.stream.Collectors;
-
-/**
- * A tool with spark-submit to drop Hudi table partitions.
- *
- * You can dry run this tool with the following command to look up and print the table partitions and the corresponding data files that would be deleted.
- * ```
- * spark-submit \
- * --class org.apache.hudi.utilities.HoodieDropPartitionsTool \
- * --packages org.apache.spark:spark-avro_2.11:2.4.4 \
- * --master local[*] \
- * --driver-memory 1g \
- * --executor-memory 1g \
- * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
- * --base-path basePath \
- * --table-name tableName \
- * --mode dry_run \
- * --partitions partition1,partition2
- * ```
- *
- * You can delete the table partitions with '--mode delete':
- *
- * - DELETE ("delete"): this tool masks/tombstones the given partitions and the corresponding data files, and lets the cleaner delete these files later.
- * - You can also set --sync-hive-meta to sync the dropped partitions to the Hive metastore.
- *
- * Example command:
- * ```
- * spark-submit \
- * --class org.apache.hudi.utilities.HoodieDropPartitionsTool \
- * --packages org.apache.spark:spark-avro_2.11:2.4.4 \
- * --master local[*] \
- * --driver-memory 1g \
- * --executor-memory 1g \
- * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
- * --base-path basePath \
- * --table-name tableName \
- * --mode delete \
- * --partitions partition1,partition2
- * ```
- *
- * You can also use --help to see all available configs.
- */
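For reference, the DELETE mode described above boiled down to a replacecommit issued through Hudi's Spark write client. A minimal sketch of that core call, assuming the `SparkRDDWriteClient#deletePartitions` API and the `UtilHelpers.createHoodieClient` helper (illustrative, not the tool's verbatim code; error handling omitted):

```java
// Sketch only: the heart of "--mode delete".
try (SparkRDDWriteClient<HoodieRecordPayload> client =
         UtilHelpers.createHoodieClient(jsc, cfg.basePath, "", cfg.parallelism, Option.empty(), props)) {
  // Reuse the caller-supplied instant time if given, otherwise mint a new one.
  String instantTime = !StringUtils.isNullOrEmpty(cfg.instantTime)
      ? cfg.instantTime : HoodieActiveTimeline.createNewInstantTime();
  client.startCommitWithTime(instantTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
  // Partitions are masked/tombstoned by the replacecommit; the cleaner deletes the files later.
  client.deletePartitions(Arrays.asList(cfg.partitions.split(",")), instantTime);
}
```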
-public class HoodieDropPartitionsTool implements Serializable {
-
- private static final Logger LOG = LogManager.getLogger(HoodieDropPartitionsTool.class);
- // Spark context
- private final transient JavaSparkContext jsc;
- // config
- private final Config cfg;
- // Properties with source, hoodie client, key generator etc.
- private TypedProperties props;
-
- private final HoodieTableMetaClient metaClient;
-
- public HoodieDropPartitionsTool(JavaSparkContext jsc, Config cfg) {
- this.jsc = jsc;
- this.cfg = cfg;
-
- this.props = cfg.propsFilePath == null
- ? UtilHelpers.buildProperties(cfg.configs)
- : readConfigFromFileSystem(jsc, cfg);
- this.metaClient = HoodieTableMetaClient.builder()
- .setConf(jsc.hadoopConfiguration()).setBasePath(cfg.basePath)
- .setLoadActiveTimelineOnLoad(true)
- .build();
- }
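Since the constructor only needs a `JavaSparkContext` and a `Config`, the tool could also be embedded outside spark-submit, e.g. in a test. An illustrative sketch, assuming the `run()` entry point from the remainder of this class (paths and values are placeholders):

```java
// Hypothetical embedding; field values mirror the CLI flags documented above.
SparkConf sparkConf = new SparkConf().setAppName("Hoodie-drop-table-partitions").setMaster("local[*]");
try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
  HoodieDropPartitionsTool.Config cfg = new HoodieDropPartitionsTool.Config();
  cfg.basePath = "/tmp/hudi/tableName";     // --base-path
  cfg.tableName = "tableName";              // --table-name
  cfg.runningMode = "dry_run";              // --mode
  cfg.partitions = "partition1,partition2"; // --partitions
  new HoodieDropPartitionsTool(jsc, cfg).run();
}
```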
-
- /**
- * Reads config from the file system.
- *
- * @param jsc {@link JavaSparkContext} instance.
- * @param cfg {@link Config} instance.
- * @return the {@link TypedProperties} instance.
- */
- private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
- return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs)
- .getProps(true);
- }
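Both configuration paths accepted by the constructor can be exercised directly with helpers this file already imports; a quick sketch (the property key is a placeholder):

```java
// 1) Inline overrides only, i.e. what --hoodie-conf feeds into UtilHelpers.buildProperties:
TypedProperties inline = UtilHelpers.buildProperties(
    Arrays.asList("hoodie.datasource.write.partitionpath.field=partition"));

// 2) A properties file on local FS or DFS (--props), with --hoodie-conf overrides layered on top:
TypedProperties fromFile = UtilHelpers.readConfig(
        jsc.hadoopConfiguration(), new Path("/path/to/drop-partitions.properties"), cfg.configs)
    .getProps(true);
```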
-
- public enum Mode {
- // Mask/tombstone these partitions and their data files; the cleaner deletes the files later.
- DELETE,
- // Dry run: only look up and print the table partitions and data files that would be deleted.
- DRY_RUN
- }
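The CLI's --mode string maps onto this enum via `Mode.valueOf`; a sketch of the dispatch, with the case bodies reduced to comments (the full class implemented these branches):

```java
// Sketch of the mode dispatch; cfg.runningMode carries the raw --mode value.
switch (Mode.valueOf(cfg.runningMode.toUpperCase())) {
  case DELETE:
    // issue the replacecommit that tombstones the selected partitions
    break;
  case DRY_RUN:
    // only resolve and print the partitions and data files that would be removed
    break;
  default:
    LOG.info("Unsupported running mode [" + cfg.runningMode + "], quit the job directly");
}
```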
-
- public static class Config implements Serializable {
- @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true)
- public String basePath = null;
- @Parameter(names = {"--mode", "-m"}, description = "Set job mode: "
- + "Set \"delete\" means mask/tombstone these partitions and corresponding data files table partitions and let cleaner delete these files later;"
- + "Set \"dry_run\" means only looking for the table partitions will be deleted and corresponding data files.", required = true)
- public String runningMode = null;
- @Parameter(names = {"--table-name", "-tn"}, description = "Table name", required = true)
- public String tableName = null;
- @Parameter(names = {"--partitions", "-p"}, description = "Comma separated list of partitions to delete.", required = true)
- public String partitions = null;
- @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for hoodie insert/upsert/delete", required = false)
- public int parallelism = 1500;
- @Parameter(names = {"--instant-time", "-it"}, description = "instant time for delete table partitions operation.", required = false)
- public String instantTime = null;
- @Parameter(names = {"--sync-hive-meta", "-sync"}, description = "Sync information to HMS.", required = false)
- public boolean syncToHive = false;
- @Parameter(names = {"--hive-database", "-db"}, description = "Database to sync to.", required = false)
- public String hiveDataBase = null;
- @Parameter(names = {"--hive-table-name"}, description = "Table to sync to.", required = false)
- public String hiveTableName = null;
- @Parameter(names = {"--hive-user-name", "-user"}, description = "hive user name to use.", required = false)
- public String hiveUserName = "hive";
- @Parameter(names = {"--hive-pass-word", "-pass"}, description = "hive password to use.", required = false)
- public String hivePassWord = "hive";
- @Parameter(names = {"--hive-jdbc-url", "-jdbc"}, description = "hive url to use.", required = false)
- public String hiveURL = "jdbc:hive2://localhost:10000";
- @Parameter(names = {"--hive-partition-field"}, description = "Comma separated list of field in the hive table to use for determining hive partition columns.", required = false)
- public String hivePartitionsField = "";
- @Parameter(names = {"--hive-sync-use-jdbc"}, description = "Use JDBC when hive synchronization.", required = false)
- public boolean hiveUseJdbc = true;
- @Parameter(names = {"--hive-metastore-uris"}, description = "hive meta store uris to use.", required = false)
- public String hiveHMSUris = null;
- @Parameter(names = {"--hive-sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.", required = false)
- public String hiveSyncMode = "hms";
- @Parameter(names = {"--hive-sync-ignore-exception"}, description = "Ignore hive sync exception.", required = false)
- public boolean hiveSyncIgnoreException = false;
- @Parameter(names = {"--hive-partition-value-extractor-class"}, description = "Class which implements PartitionValueExtractor to extract the partition values,"
- + " default 'SlashEncodedDayPartitionValueExtractor'.", required = false)
- public String partitionValueExtractorClass = "org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor";
- @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false)
- public String sparkMaster = null;
- @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false)
- public String sparkMemory = "1g";
- @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
- + "hoodie client for deleting table partitions")
- public String propsFilePath = null;
- @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
- + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated",
- splitter = IdentitySplitter.class)
- public List<String> configs = new ArrayList<>();
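`Config` is a plain JCommander parameter object, so the entry point follows the usual parse-then-run pattern. A sketch, assuming a `--help` boolean field from the truncated remainder of the class:

```java
// Sketch of the JCommander entry point that consumed this Config.
public static void main(String[] args) {
  final Config cfg = new Config();
  JCommander cmd = new JCommander(cfg, null, args);
  if (cfg.help || args.length == 0) { // cfg.help is assumed, not shown above
    cmd.usage();
    System.exit(1);
  }
  // ... build the JavaSparkContext and invoke the tool's run()
}
```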