diff --git a/docker/demo/config/test-suite/compact-test.properties b/docker/demo/config/test-suite/compact-test.properties index 2eca88de3a426..a237f8806e298 100644 --- a/docker/demo/config/test-suite/compact-test.properties +++ b/docker/demo/config/test-suite/compact-test.properties @@ -46,5 +46,5 @@ hoodie.datasource.hive_sync.database=testdb hoodie.datasource.hive_sync.table=table1 hoodie.datasource.hive_sync.assume_date_partitioning=false hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path -hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor diff --git a/docker/demo/config/test-suite/templates/test.properties.template b/docker/demo/config/test-suite/templates/test.properties.template index e1b65fb730a18..a6c0b2174ff81 100644 --- a/docker/demo/config/test-suite/templates/test.properties.template +++ b/docker/demo/config/test-suite/templates/test.properties.template @@ -46,5 +46,5 @@ hoodie.datasource.hive_sync.database=testdb hoodie.datasource.hive_sync.table=table1 hoodie.datasource.hive_sync.assume_date_partitioning=false hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path -hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor diff --git a/docker/demo/config/test-suite/test.properties b/docker/demo/config/test-suite/test.properties index 30cd1c1f02f09..81a9b2fd09552 100644 --- a/docker/demo/config/test-suite/test.properties +++ b/docker/demo/config/test-suite/test.properties @@ -47,5 +47,5 @@ hoodie.datasource.hive_sync.database=testdb hoodie.datasource.hive_sync.table=table1 hoodie.datasource.hive_sync.assume_date_partitioning=false hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path -hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor diff --git a/docker/demo/sparksql-incremental.commands b/docker/demo/sparksql-incremental.commands index da61347ec275b..8f5017f1f873e 100644 --- a/docker/demo/sparksql-incremental.commands +++ b/docker/demo/sparksql-incremental.commands @@ -21,7 +21,9 @@ import org.apache.hudi.DataSourceWriteOptions; import org.apache.spark.sql.SaveMode; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.HoodieDataSourceHelpers; -import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.sync.common.HoodieSyncConfig; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; import org.apache.hadoop.fs.FileSystem; val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) @@ -43,14 +45,14 @@ spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, cl option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), "datestr"). option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "ts"). option(HoodieWriteConfig.TBL_NAME.key(), "stock_ticks_derived_mor"). - option(DataSourceWriteOptions.HIVE_TABLE.key(), "stock_ticks_derived_mor"). - option(DataSourceWriteOptions.HIVE_DATABASE.key(), "default"). 
- option(DataSourceWriteOptions.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000"). - option(DataSourceWriteOptions.HIVE_USER.key(), "hive"). - option(DataSourceWriteOptions.HIVE_PASS.key(), "hive"). - option(DataSourceWriteOptions.HIVE_SYNC_ENABLED.key(), "true"). - option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS.key(), "datestr"). - option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS.key(), classOf[MultiPartKeysValueExtractor].getCanonicalName). + option(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "stock_ticks_derived_mor"). + option(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default"). + option(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000"). + option(HiveSyncConfig.HIVE_USER.key(), "hive"). + option(HiveSyncConfig.HIVE_PASS.key(), "hive"). + option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true"). + option(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"). + option(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), classOf[MultiPartKeysValueExtractor].getCanonicalName). option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key(), "true"). mode(SaveMode.Overwrite). save("/user/hive/warehouse/stock_ticks_derived_mor"); @@ -75,14 +77,14 @@ spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, cl option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), "datestr"). option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "ts"). option(HoodieWriteConfig.TBL_NAME.key(), "stock_ticks_derived_mor_bs"). - option(DataSourceWriteOptions.HIVE_TABLE.key(), "stock_ticks_derived_mor_bs"). - option(DataSourceWriteOptions.HIVE_DATABASE.key(), "default"). - option(DataSourceWriteOptions.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000"). - option(DataSourceWriteOptions.HIVE_USER.key(), "hive"). - option(DataSourceWriteOptions.HIVE_PASS.key(), "hive"). - option(DataSourceWriteOptions.HIVE_SYNC_ENABLED.key(), "true"). - option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS.key(), "datestr"). - option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS.key(), classOf[MultiPartKeysValueExtractor].getCanonicalName). + option(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "stock_ticks_derived_mor_bs"). + option(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default"). + option(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000"). + option(HiveSyncConfig.HIVE_USER.key(), "hive"). + option(HiveSyncConfig.HIVE_PASS.key(), "hive"). + option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true"). + option(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"). + option(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), classOf[MultiPartKeysValueExtractor].getCanonicalName). option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key(), "true"). mode(SaveMode.Overwrite). 
save("/user/hive/warehouse/stock_ticks_derived_mor_bs"); diff --git a/hudi-aws/pom.xml b/hudi-aws/pom.xml index d44a389a61f66..f2b40524bf62e 100644 --- a/hudi-aws/pom.xml +++ b/hudi-aws/pom.xml @@ -41,6 +41,12 @@ ${project.version} + + org.apache.hudi + hudi-hive-sync + ${project.version} + + log4j @@ -75,6 +81,28 @@ ${dynamodb.lockclient.version} + + + ${hive.groupid} + hive-service + ${hive.version} + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + + + + org.apache.parquet + parquet-avro + + com.amazonaws @@ -104,6 +132,13 @@ metrics-core + + + com.amazonaws + aws-java-sdk-glue + ${aws.sdk.version} + + org.junit.jupiter diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogClient.java b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogClient.java new file mode 100644 index 0000000000000..a2c919416ca99 --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogClient.java @@ -0,0 +1,485 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.aws.sync; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hive.AbstractHiveSyncHoodieClient; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.hive.HoodieHiveSyncException; +import org.apache.hudi.hive.util.HiveSchemaUtil; +import org.apache.hudi.sync.common.AbstractSyncHoodieClient; +import org.apache.hudi.sync.common.PartitionValueExtractor; + +import com.amazonaws.services.glue.AWSGlue; +import com.amazonaws.services.glue.AWSGlueClientBuilder; +import com.amazonaws.services.glue.model.AlreadyExistsException; +import com.amazonaws.services.glue.model.BatchCreatePartitionRequest; +import com.amazonaws.services.glue.model.BatchCreatePartitionResult; +import com.amazonaws.services.glue.model.BatchUpdatePartitionRequest; +import com.amazonaws.services.glue.model.BatchUpdatePartitionRequestEntry; +import com.amazonaws.services.glue.model.BatchUpdatePartitionResult; +import com.amazonaws.services.glue.model.Column; +import com.amazonaws.services.glue.model.CreateDatabaseRequest; +import com.amazonaws.services.glue.model.CreateDatabaseResult; +import com.amazonaws.services.glue.model.CreateTableRequest; +import com.amazonaws.services.glue.model.CreateTableResult; +import com.amazonaws.services.glue.model.DatabaseInput; +import com.amazonaws.services.glue.model.EntityNotFoundException; +import com.amazonaws.services.glue.model.GetDatabaseRequest; +import com.amazonaws.services.glue.model.GetPartitionsRequest; +import com.amazonaws.services.glue.model.GetPartitionsResult; +import com.amazonaws.services.glue.model.GetTableRequest; +import com.amazonaws.services.glue.model.Partition; +import com.amazonaws.services.glue.model.PartitionInput; +import com.amazonaws.services.glue.model.SerDeInfo; +import com.amazonaws.services.glue.model.StorageDescriptor; +import com.amazonaws.services.glue.model.Table; +import com.amazonaws.services.glue.model.TableInput; +import com.amazonaws.services.glue.model.UpdateTableRequest; +import com.esotericsoftware.minlog.Log; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.schema.MessageType; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * This class implements all the AWS APIs to enable syncing of a Hudi Table with the + * AWS Glue Data Catalog (https://docs.aws.amazon.com/glue/latest/dg/populate-data-catalog.html). 
+ */ +public class AWSGlueCatalogClient extends AbstractHiveSyncHoodieClient { + + private static final Logger LOG = LogManager.getLogger(AWSGlueCatalogClient.class); + private final HoodieTimeline activeTimeline; + private final AWSGlue awsGlueClient; + private final HiveSyncConfig syncConfig; + private final PartitionValueExtractor partitionValueExtractor; + + public AWSGlueCatalogClient(HiveSyncConfig cfg, FileSystem fs) { + super(cfg.basePath, cfg.assumeDatePartitioning, cfg.useFileListingFromMetadata, cfg.withOperationField, fs); + this.awsGlueClient = getGlueClient(); + this.syncConfig = cfg; + activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + try { + this.partitionValueExtractor = + (PartitionValueExtractor) Class.forName(syncConfig.partitionValueExtractorClass).newInstance(); + } catch (Exception e) { + throw new HoodieHiveSyncException( + "Failed to initialize PartitionValueExtractor class " + syncConfig.partitionValueExtractorClass, e); + } + } + + @Override + public boolean databaseExists() { + GetDatabaseRequest request = new GetDatabaseRequest(); + request.setName(syncConfig.databaseName); + try { + return (awsGlueClient.getDatabase(request).getDatabase() != null); + } catch (EntityNotFoundException exception) { + LOG.info("Database " + syncConfig.databaseName, exception); + } catch (Exception exception) { + LOG.error("Failed to check if database exists " + syncConfig.databaseName, exception); + throw new HoodieHiveSyncException("Failed to check if database exists " + syncConfig.databaseName + " in region ", exception); + } + return false; + } + + @Override + public void createDatabase() { + if (!databaseExists()) { + CreateDatabaseRequest request = new CreateDatabaseRequest(); + request.setDatabaseInput(new DatabaseInput().withName(syncConfig.databaseName).withDescription("automatically created by hudi").withParameters(null).withLocationUri(null)); + try { + CreateDatabaseResult result = awsGlueClient.createDatabase(request); + LOG.info("Successfully created database in Glue: " + result.toString()); + } catch (AlreadyExistsException exception) { + LOG.warn("Database " + syncConfig.databaseName + " already exists", exception); + } catch (Exception exception) { + LOG.error("Failed to create database " + syncConfig.databaseName, exception); + throw new HoodieHiveSyncException("Failed to create database " + syncConfig.databaseName, exception); + } + } + } + + @Override + public boolean tableExists(String tableName) { + GetTableRequest request = new GetTableRequest() + .withDatabaseName(syncConfig.databaseName) + .withName(tableName); + try { + awsGlueClient.getTable(request); + return true; + } catch (EntityNotFoundException exception) { + LOG.info("Accessing non-existent Glue Table " + tableName + " in database " + syncConfig.databaseName, exception); + } catch (Exception exception) { + String errorMsg = "Fatal error while fetching Glue Table " + tableName + " in database " + syncConfig.databaseName; + LOG.error(errorMsg, exception); + throw new HoodieHiveSyncException(errorMsg, exception); + } + return false; + } + + @Override + public void createTable(String tableName, MessageType storageSchema, + String inputFormatClass, String outputFormatClass, + String serdeClass, Map serdeProperties, + Map tableProperties) { + if (!tableExists(tableName)) { + CreateTableRequest request = new CreateTableRequest(); + Map params = new HashMap<>(); + if (!syncConfig.createManagedTable) { + params.put("EXTERNAL", "TRUE"); + } + for (Map.Entry 
<String, String> entry : tableProperties.entrySet()) { + params.put(entry.getKey(), entry.getValue()); + } + + try { + Map<String, String> mapSchema = HiveSchemaUtil.parquetSchemaToMapSchema(storageSchema, syncConfig.supportTimestamp, false); + + List<Column> schemaPartitionKeys = new ArrayList<>(); + List<Column> schemaWithoutPartitionKeys = new ArrayList<>(); + for (String key : mapSchema.keySet()) { + String keyType = HiveSchemaUtil.getPartitionKeyType(mapSchema, key); + Column column = new Column().withName(key).withType(keyType.toLowerCase()).withComment(""); + // In Glue, the full schema should exclude the partition keys + if (syncConfig.partitionFields.contains(key)) { + schemaPartitionKeys.add(column); + } else { + schemaWithoutPartitionKeys.add(column); + } + } + + StorageDescriptor storageDescriptor = new StorageDescriptor(); + serdeProperties.put("serialization.format", "1"); + storageDescriptor + .withSerdeInfo(new SerDeInfo().withSerializationLibrary(serdeClass) + .withParameters(serdeProperties)) + .withLocation(syncConfig.basePath.replaceFirst("s3a", "s3")) + .withInputFormat(inputFormatClass) + .withOutputFormat(outputFormatClass) + .withColumns(schemaWithoutPartitionKeys); + + TableInput tableInput = new TableInput(); + tableInput.withName(tableName) + .withTableType(TableType.EXTERNAL_TABLE.toString()) + .withParameters(params) + .withPartitionKeys(schemaPartitionKeys) + .withStorageDescriptor(storageDescriptor) + .withLastAccessTime(new Date(System.currentTimeMillis())) + .withLastAnalyzedTime(new Date(System.currentTimeMillis())); + request.withDatabaseName(syncConfig.databaseName) + .withTableInput(tableInput); + + CreateTableResult result = awsGlueClient.createTable(request); + LOG.info("Successfully created table in Glue: " + result.toString()); + } catch (AlreadyExistsException exception) { + LOG.warn("Table " + tableName + " already exists in database " + syncConfig.databaseName, exception); + } catch (Exception exception) { + LOG.error("Failed to create table " + tableName + " in database " + syncConfig.databaseName, exception); + throw new HoodieHiveSyncException("Failed to create table " + tableName + " in database " + syncConfig.databaseName, exception); + } + } + } + + @Override + public Option<String> getLastCommitTimeSynced(String tableName) { + try { + Table table = getTable(syncConfig.databaseName, tableName); + return Option.of(table.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null)); + } catch (Exception e) { + throw new HoodieHiveSyncException("Failed to get the last commit time synced from the database", e); + } + } + + @Override + public void updateLastCommitTimeSynced(String tableName) { + // Set the last commit time from the active timeline + String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp(); + try { + updateTableParameters(syncConfig.databaseName, tableName, Collections.singletonMap(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced), false); + } catch (Exception e) { + throw new HoodieHiveSyncException("Failed to update last commit time synced to " + lastCommitSynced, e); + } + } + + @Override + public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) { + if (partitionsToAdd.isEmpty()) { + LOG.info("No partitions to add for " + tableName); + return; + } + LOG.info("Adding partitions " + partitionsToAdd.size() + " to table " + tableName); + try { + StorageDescriptor sd = getSd(syncConfig.databaseName, tableName); + List<PartitionInput> partitionInputs = partitionsToAdd.stream().map(partition -> { + StorageDescriptor partitionSd = sd.clone(); + String
fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString(); + List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); + partitionSd.setLocation(fullPartitionPath); + return new PartitionInput().withValues(partitionValues).withStorageDescriptor(partitionSd); + }).collect(Collectors.toList()); + + BatchCreatePartitionRequest request = new BatchCreatePartitionRequest(); + request.withDatabaseName(syncConfig.databaseName).withTableName(tableName).withPartitionInputList(partitionInputs); + + BatchCreatePartitionResult result = awsGlueClient.batchCreatePartition(request); + if (result.getErrors() != null && !result.getErrors().isEmpty()) { + throw new HoodieHiveSyncException("Fatal error for Add Partitions Failed " + tableName + " with errors: " + result.getErrors().toString()); + } + } catch (Exception e) { + LOG.error(syncConfig.databaseName + "." + tableName + " add partition failed", e); + throw new HoodieHiveSyncException("Fatal error for Add Partitions Failed " + tableName + " in database " + syncConfig.databaseName, e); + } + } + + @Override + public void updatePartitionsToTable(String tableName, List<String> changedPartitions) { + if (changedPartitions.isEmpty()) { + LOG.info("No partitions to change for " + tableName); + return; + } + LOG.info("Changing partitions " + changedPartitions.size() + " on " + tableName); + try { + StorageDescriptor sd = getSd(syncConfig.databaseName, tableName); + List<BatchUpdatePartitionRequestEntry> updatePartitionEntries = changedPartitions.stream().map(partition -> { + StorageDescriptor partitionSd = sd.clone(); + String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString(); + List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); + partitionSd.setLocation(fullPartitionPath); + PartitionInput partitionInput = new PartitionInput().withValues(partitionValues).withStorageDescriptor(partitionSd); + return new BatchUpdatePartitionRequestEntry().withPartitionInput(partitionInput).withPartitionValueList(partitionValues); + }).collect(Collectors.toList()); + + BatchUpdatePartitionRequest request = new BatchUpdatePartitionRequest(); + request.withDatabaseName(syncConfig.databaseName).withTableName(tableName).withEntries(updatePartitionEntries); + + BatchUpdatePartitionResult result = awsGlueClient.batchUpdatePartition(request); + if (result.getErrors() != null && !result.getErrors().isEmpty()) { + LOG.error("Fatal error for Update Partitions Failed " + tableName + " in database " + syncConfig.databaseName + " with errors: " + result.getErrors().toString()); + throw new HoodieHiveSyncException("Fatal error for Update Partitions Failed " + tableName + " with errors: " + result.getErrors().toString()); + } + } catch (Exception e) { + LOG.error("Fatal error for Update Partitions Failed " + tableName + " in database " + syncConfig.databaseName, e); + throw new HoodieHiveSyncException("Fatal error for Update Partitions Failed " + tableName + " in database " + syncConfig.databaseName, e); + } + } + + @Override + public Map<String, String> getTableSchema(String tableName) { + try { + // GlueMetastoreClient returns partition keys separate from Columns, hence get both and merge to + // get the Schema of the table.
+ final long start = System.currentTimeMillis(); + Table table = getTable(syncConfig.databaseName, tableName); + Map partitionKeysMap = + table.getPartitionKeys().stream().collect(Collectors.toMap(Column::getName, f -> f.getType().toUpperCase())); + + Map columnsMap = + table.getStorageDescriptor().getColumns().stream().collect(Collectors.toMap(Column::getName, f -> f.getType().toUpperCase())); + + Map schema = new HashMap<>(); + schema.putAll(columnsMap); + schema.putAll(partitionKeysMap); + final long end = System.currentTimeMillis(); + LOG.info(String.format("Time taken to getTableSchema: %s ms", (end - start))); + return schema; + } catch (Exception e) { + throw new HoodieHiveSyncException("Failed to get Glue table schema for : " + tableName + " in database " + syncConfig.databaseName, e); + } + } + + @Override + public void updateSchema(String tableName, MessageType newSchema) { + // ToDo Cascade is set in Hive meta sync, but need to investigate how to configure it for Glue meta + boolean cascade = syncConfig.partitionFields.size() > 0; + try { + Table table = getTable(syncConfig.databaseName, tableName); + StorageDescriptor sd = getSd(syncConfig.databaseName, tableName); + Map mapNewSchema = HiveSchemaUtil.parquetSchemaToMapSchema(newSchema, syncConfig.supportTimestamp, false); + + List newColumns = mapNewSchema.keySet().stream().map(key -> { + String keyType = HiveSchemaUtil.getPartitionKeyType(mapNewSchema, key); + return new Column().withName(key).withType(keyType.toLowerCase()).withComment(""); + }).collect(Collectors.toList()); + sd.setColumns(newColumns); + + TableInput updatedTableInput = new TableInput(); + updatedTableInput.withName(tableName) + .withTableType(table.getTableType()) + .withParameters(table.getParameters()) + .withPartitionKeys(table.getPartitionKeys()) + .withStorageDescriptor(sd) + .withLastAccessTime(new Date(System.currentTimeMillis())) + .withLastAnalyzedTime(new Date(System.currentTimeMillis())); + + UpdateTableRequest request = new UpdateTableRequest(); + request.withDatabaseName(syncConfig.databaseName) + .withTableInput(updatedTableInput); + awsGlueClient.updateTable(request); + } catch (Exception e) { + String errorMsg = "Fatal error for Update Schema for table " + tableName + " in database " + syncConfig.databaseName; + LOG.error(errorMsg, e); + throw new HoodieHiveSyncException(errorMsg, e); + } + } + + /** + * Iterate over the storage partitions and find if there are any new partitions that need to be added or updated. + * Generate a list of PartitionEvent based on the changes required. 
+ */ + @Override + public List getPartitionEvents(String tableName, List partitionStoragePartitions) { + List tablePartitions = scanTablePartitions(syncConfig.databaseName, tableName); + + Map paths = new HashMap<>(); + for (Partition tablePartition : tablePartitions) { + List hivePartitionValues = tablePartition.getValues(); + String fullTablePartitionPath = + Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getStorageDescriptor().getLocation())).toUri().getPath(); + paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath); + } + + List events = new ArrayList<>(); + for (String storagePartition : partitionStoragePartitions) { + Path storagePartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, storagePartition); + String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath(); + // Check if the partition values or if hdfs path is the same + List storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition); + if (!storagePartitionValues.isEmpty()) { + String storageValue = String.join(", ", storagePartitionValues); + if (!paths.containsKey(storageValue)) { + events.add(PartitionEvent.newPartitionAddEvent(storagePartition)); + } else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) { + events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition)); + } + } + } + return events; + } + + /** + * Update the table properties to the table. + */ + @Override + public void updateTableProperties(String tableName, Map tableProperties) { + if (tableProperties == null || tableProperties.isEmpty()) { + return; + } + try { + updateTableParameters(syncConfig.databaseName, tableName, tableProperties, true); + } catch (Exception e) { + String errorMsg = "Failed to update table properties for Glue table: " + tableName + " in database " + syncConfig.databaseName; + Log.error(errorMsg, e); + throw new HoodieHiveSyncException(errorMsg, e); + } + } + + @Override + public void close() { + awsGlueClient.shutdown(); + } + + private enum TableType { + MANAGED_TABLE, + EXTERNAL_TABLE, + VIRTUAL_VIEW, + INDEX_TABLE, + MATERIALIZED_VIEW; + + private TableType() { + } + } + + private Table getTable(String databaseName, String tableName) throws HoodieHiveSyncException { + GetTableRequest request = new GetTableRequest() + .withDatabaseName(databaseName) + .withName(tableName); + try { + return awsGlueClient.getTable(request).getTable(); + } catch (EntityNotFoundException exception) { + String errorMsg = "Accessing non-existent Glue Table " + tableName + " in database " + databaseName; + LOG.error(errorMsg, exception); + throw new HoodieHiveSyncException(errorMsg, exception); + } catch (Exception exception) { + String errorMsg = "Fatal error while fetching Glue Table " + tableName + " in database " + databaseName; + LOG.error(errorMsg, exception); + throw new HoodieHiveSyncException(errorMsg, exception); + } + } + + private List scanTablePartitions(String databaseName, String tableName) { + try { + GetPartitionsRequest request = new GetPartitionsRequest(); + request.withDatabaseName(databaseName).withTableName(tableName); + GetPartitionsResult result = awsGlueClient.getPartitions(request); + return result.getPartitions(); + } catch (Exception exception) { + throw new HoodieHiveSyncException("Fatal error while scanning all table partitions for table " + + tableName + " in database " + databaseName, exception); + } + } + + private StorageDescriptor getSd(String databaseName, String 
tableName) { + return getTable(databaseName, tableName).getStorageDescriptor(); + } + + private void updateTableParameters(String databaseName, String tableName, Map<String, String> updatedParams, boolean shouldReplace) { + try { + Table table = getTable(databaseName, tableName); + Map<String, String> finalParams = new HashMap<>(); + if (!shouldReplace) { + finalParams.putAll(table.getParameters()); + } + finalParams.putAll(updatedParams); + + TableInput updatedTableInput = new TableInput(); + updatedTableInput.withName(tableName) + .withTableType(table.getTableType()) + .withParameters(finalParams) + .withPartitionKeys(table.getPartitionKeys()) + .withStorageDescriptor(table.getStorageDescriptor()) + .withLastAccessTime(new Date(System.currentTimeMillis())) + .withLastAnalyzedTime(new Date(System.currentTimeMillis())); + + UpdateTableRequest request = new UpdateTableRequest(); + request.withDatabaseName(syncConfig.databaseName) + .withTableInput(updatedTableInput); + awsGlueClient.updateTable(request); + } catch (Exception e) { + String errorMsg = "Failed to update the params: " + updatedParams + " for table " + tableName + " in database " + databaseName; + Log.error(errorMsg, e); + throw new HoodieHiveSyncException(errorMsg, e); + } + } + + private static AWSGlue getGlueClient() { + return AWSGlueClientBuilder.standard().build(); + } +} \ No newline at end of file diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AwsGlueCatalogSyncTool.java b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AwsGlueCatalogSyncTool.java new file mode 100644 index 0000000000000..9249e37988ecd --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AwsGlueCatalogSyncTool.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.aws.sync; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.hive.HiveSyncTool; + +import com.beust.jcommander.JCommander; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hive.conf.HiveConf; + +/** + * Currently Experimental. Utility class that implements syncing a Hudi Table with the + * AWS Glue Data Catalog (https://docs.aws.amazon.com/glue/latest/dg/populate-data-catalog.html) + * to enable querying via Glue ETLs, Athena etc. + * + * Extends HiveSyncTool since most logic is similar to Hive syncing, + * except that it uses a different client {@link AWSGlueCatalogClient} that implements + * the necessary functionality using Glue APIs.
+ */ +public class AwsGlueCatalogSyncTool extends HiveSyncTool { + + public AwsGlueCatalogSyncTool(TypedProperties props, Configuration conf, FileSystem fs) { + this(new HiveSyncConfig(props), new HiveConf(conf, HiveConf.class), fs); + } + + private AwsGlueCatalogSyncTool(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf, FileSystem fs) { + super(new AWSGlueCatalogClient(hiveSyncConfig, fs), hiveSyncConfig, hiveConf, fs); + } + + public static void main(String[] args) { + // parse the params + final HiveSyncConfig cfg = new HiveSyncConfig(); + JCommander cmd = new JCommander(cfg, null, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + Configuration hadoopConf = new Configuration(); + FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf); + new AwsGlueCatalogSyncTool(cfg, new HiveConf(hadoopConf, HiveConf.class), fs).syncHoodieTable(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index df4b3f6c33a03..cc29a1a3ef715 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -91,7 +91,7 @@ public class HoodieWriteConfig extends HoodieConfig { public static final String DELTASTREAMER_CHECKPOINT_KEY = "deltastreamer.checkpoint.key"; public static final ConfigProperty<String> TBL_NAME = ConfigProperty - .key("hoodie.table.name") + .key(HoodieTableConfig.HOODIE_TABLE_NAME_KEY) .noDefaultValue() .withDocumentation("Table name that will be used for registering with metastores like HMS. Needs to be same across runs."); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java index 9c142ee3aba73..7218a456166ac 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java @@ -139,6 +139,12 @@ public Integer getInt(ConfigProperty<?> configProperty) { return rawValue.map(v -> Integer.parseInt(v.toString())).orElse(null); } + public Integer getIntOrDefault(ConfigProperty<?> configProperty) { + Option<Object> rawValue = getRawValue(configProperty); + return rawValue.map(v -> Integer.parseInt(v.toString())) + .orElseGet(() -> Integer.parseInt(configProperty.defaultValue().toString())); + } + public Boolean getBoolean(ConfigProperty<?> configProperty) { Option<Object> rawValue = getRawValue(configProperty); return rawValue.map(v -> Boolean.parseBoolean(v.toString())).orElse(null); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index e4b60e2ea3854..adf1695cfad47 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -73,9 +73,11 @@ public class HoodieTableConfig extends HoodieConfig { public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties"; public static final String HOODIE_PROPERTIES_FILE_BACKUP = "hoodie.properties.backup"; + public static final String HOODIE_WRITE_TABLE_NAME_KEY = "hoodie.datasource.write.table.name"; + public static final String HOODIE_TABLE_NAME_KEY = "hoodie.table.name"; public static final ConfigProperty<String> NAME =
ConfigProperty - .key("hoodie.table.name") + .key(HOODIE_TABLE_NAME_KEY) .noDefaultValue() .withDocumentation("Table name that will be used for registering with Hive. Needs to be same across runs."); diff --git a/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java b/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java index 77c3f15e54c45..2233651f0ea2b 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java +++ b/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java @@ -25,7 +25,7 @@ import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.keygen.constant.KeyGeneratorType; diff --git a/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java b/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java index 1d7111f495c58..e496e4598c887 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java +++ b/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java @@ -23,7 +23,7 @@ import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.configuration.FlinkOptions; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.keygen.constant.KeyGeneratorType; import org.apache.hudi.util.StreamerUtil; diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java index 5299551fccd38..8ce77a54ef0d0 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java @@ -23,7 +23,7 @@ import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.OptionsResolver; import org.apache.hudi.exception.HoodieValidationException; -import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; import org.apache.hudi.keygen.ComplexAvroKeyGenerator; import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; diff --git a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java index cbdffe360fd2b..1e47d4f7547d8 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java +++ b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java @@ -22,8 +22,8 @@ import org.apache.hudi.common.model.EventTimeAvroPayload; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.exception.HoodieValidationException; -import org.apache.hudi.hive.MultiPartKeysValueExtractor; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; +import org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.keygen.ComplexAvroKeyGenerator; import 
org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java index 9a369bcbd7dd9..0d9839832b719 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java @@ -28,6 +28,7 @@ import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; import org.apache.hudi.integ.testsuite.schema.SchemaUtils; +import org.apache.hudi.sync.common.HoodieSyncConfig; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.ReduceFunction; @@ -108,8 +109,8 @@ public void execute(ExecutionContext context, int curItrCount) throws Exception } if (config.isValidateHive()) { - String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key()); - String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key()); + String database = context.getWriterContext().getProps().getString(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key()); + String tableName = context.getWriterContext().getProps().getString(HoodieSyncConfig.META_SYNC_TABLE_NAME.key()); log.warn("Validating hive table with db : " + database + " and table : " + tableName); Dataset cowDf = session.sql("SELECT * FROM " + database + "." + tableName); Dataset trimmedCowDf = cowDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java index bdde58adb19e6..e744246062895 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java @@ -24,11 +24,13 @@ import java.sql.SQLException; import java.sql.Statement; import org.apache.hudi.DataSourceUtils; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider; +import org.apache.hudi.sync.common.HoodieSyncConfig; /** * A hive query node in the DAG of operations for a workflow. used to perform a hive query with given config. 
@@ -46,13 +48,16 @@ public HiveQueryNode(DeltaConfig.Config config) { public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { log.info("Executing hive query node {}", this.getName()); this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration()); - HiveSyncConfig hiveSyncConfig = DataSourceUtils - .buildHiveSyncConfig(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() - .getDeltaSyncService().getDeltaSync().getProps(), - executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() - .getDeltaSyncService().getDeltaSync().getCfg().targetBasePath, - executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() - .getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat); + + TypedProperties properties = new TypedProperties(); + properties.putAll(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() + .getDeltaSyncService().getDeltaSync().getProps()); + properties.put(HoodieSyncConfig.META_SYNC_BASE_PATH, executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() + .getDeltaSyncService().getDeltaSync().getCfg().targetBasePath); + properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT, executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() + .getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat); + HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(properties); + this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter()); Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser, hiveSyncConfig.hivePass); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java index 85a292c2a2701..a10e4d31fa736 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java @@ -21,6 +21,9 @@ import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hive.service.server.HiveServer2; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.hive.testutils.HiveTestService; import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; @@ -46,12 +49,17 @@ public void startLocalHiveServiceIfNeeded(Configuration configuration) throws IO } public void syncToLocalHiveIfNeeded(HoodieTestSuiteWriter writer) { + HiveSyncTool hiveSyncTool; if (this.config.isHiveLocal()) { - writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync() - .syncHive(getLocalHiveServer().getHiveConf()); + hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(), + getLocalHiveServer().getHiveConf(), + FSUtils.getFs(writer.getWriteConfig().getBasePath(), getLocalHiveServer().getHiveConf())); } else { - writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive(); + hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(), + getLocalHiveServer().getHiveConf(), + FSUtils.getFs(writer.getWriteConfig().getBasePath(), writer.getConfiguration())); } + hiveSyncTool.syncHoodieTable(); } public void stopLocalHiveServiceIfNeeded() throws IOException { diff --git 
a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java index c32f44d1c5f20..fdd30b7590f60 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob; import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; @@ -174,10 +175,10 @@ private static TypedProperties getProperties() { // Make path selection test suite specific props.setProperty("hoodie.deltastreamer.source.input.selector", DFSTestSuitePathSelector.class.getName()); // Hive Configs - props.setProperty(DataSourceWriteOptions.HIVE_URL().key(), "jdbc:hive2://127.0.0.1:9999/"); - props.setProperty(DataSourceWriteOptions.HIVE_DATABASE().key(), "testdb1"); - props.setProperty(DataSourceWriteOptions.HIVE_TABLE().key(), "table1"); - props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "datestr"); + props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/"); + props.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb1"); + props.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), "table1"); + props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), TimestampBasedKeyGenerator.class.getName()); props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider"); diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java index cca738a70eba6..e3face02c581d 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java @@ -18,7 +18,6 @@ package org.apache.hudi.connect.writers; -import org.apache.hudi.DataSourceUtils; import org.apache.hudi.client.HoodieJavaWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieJavaEngineContext; @@ -31,20 +30,17 @@ import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.connect.transaction.TransactionCoordinator; import org.apache.hudi.connect.utils.KafkaConnectUtils; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.hive.HiveSyncConfig; -import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; -import org.apache.hudi.sync.common.AbstractSyncTool; +import org.apache.hudi.sync.common.HoodieSyncConfig; +import org.apache.hudi.sync.common.util.SyncUtilHelpers; import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hive.conf.HiveConf; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -53,7 +49,6 @@ import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; /** @@ -166,39 +161,10 @@ private void syncMeta() { Set syncClientToolClasses = new HashSet<>( Arrays.asList(connectConfigs.getMetaSyncClasses().split(","))); if (connectConfigs.isMetaSyncEnabled()) { + FileSystem fs = FSUtils.getFs(tableBasePath, new Configuration()); for (String impl : syncClientToolClasses) { - impl = impl.trim(); - switch (impl) { - case "org.apache.hudi.hive.HiveSyncTool": - syncHive(); - break; - default: - FileSystem fs = FSUtils.getFs(tableBasePath, new Configuration()); - Properties properties = new Properties(); - properties.putAll(connectConfigs.getProps()); - properties.put("basePath", tableBasePath); - AbstractSyncTool syncTool = (AbstractSyncTool) ReflectionUtils.loadClass(impl, new Class[] {Properties.class, FileSystem.class}, properties, fs); - syncTool.syncHoodieTable(); - } + SyncUtilHelpers.createAndSyncHoodieMeta(impl.trim(), connectConfigs.getProps(), hadoopConf, fs, tableBasePath, HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue()); } } } - - private void syncHive() { - HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig( - new TypedProperties(connectConfigs.getProps()), - tableBasePath, - "PARQUET"); - LOG.info("Syncing target hoodie table with hive table(" - + hiveSyncConfig.tableName - + "). Hive metastore URL :" - + hiveSyncConfig.jdbcUrl - + ", basePath :" + tableBasePath); - LOG.info("Hive Sync Conf => " + hiveSyncConfig.toString()); - FileSystem fs = FSUtils.getFs(tableBasePath, hadoopConf); - HiveConf hiveConf = new HiveConf(); - hiveConf.addResource(fs.getConf()); - LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString()); - new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable(); - } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java index b98417ef2b8f4..d7710c63700d8 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -42,8 +42,6 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.exception.TableNotFoundException; -import org.apache.hudi.hive.HiveSyncConfig; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.index.HoodieIndex.IndexType; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.log4j.LogManager; @@ -54,8 +52,6 @@ import org.apache.spark.sql.Row; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -271,42 +267,4 @@ public static JavaRDD dropDuplicates(JavaSparkContext jssc, JavaRD HoodieWriteConfig.newBuilder().withPath(parameters.get("path")).withProps(parameters).build(); return dropDuplicates(jssc, incomingHoodieRecords, writeConfig); } - - public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath, String baseFileFormat) { - checkRequiredProperties(props, 
Collections.singletonList(DataSourceWriteOptions.HIVE_TABLE().key())); - HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); - hiveSyncConfig.basePath = basePath; - hiveSyncConfig.usePreApacheInputFormat = - props.getBoolean(DataSourceWriteOptions.HIVE_USE_PRE_APACHE_INPUT_FORMAT().key(), - Boolean.parseBoolean(DataSourceWriteOptions.HIVE_USE_PRE_APACHE_INPUT_FORMAT().defaultValue())); - hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE().key(), - DataSourceWriteOptions.HIVE_DATABASE().defaultValue()); - hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE().key()); - hiveSyncConfig.baseFileFormat = baseFileFormat; - hiveSyncConfig.hiveUser = - props.getString(DataSourceWriteOptions.HIVE_USER().key(), DataSourceWriteOptions.HIVE_USER().defaultValue()); - hiveSyncConfig.hivePass = - props.getString(DataSourceWriteOptions.HIVE_PASS().key(), DataSourceWriteOptions.HIVE_PASS().defaultValue()); - hiveSyncConfig.jdbcUrl = - props.getString(DataSourceWriteOptions.HIVE_URL().key(), DataSourceWriteOptions.HIVE_URL().defaultValue()); - hiveSyncConfig.partitionFields = - props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), ",", new ArrayList<>()); - hiveSyncConfig.partitionValueExtractorClass = - props.getString(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), - SlashEncodedDayPartitionValueExtractor.class.getName()); - hiveSyncConfig.useJdbc = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_USE_JDBC().key(), - DataSourceWriteOptions.HIVE_USE_JDBC().defaultValue())); - if (props.containsKey(DataSourceWriteOptions.HIVE_SYNC_MODE().key())) { - hiveSyncConfig.syncMode = props.getString(DataSourceWriteOptions.HIVE_SYNC_MODE().key()); - } - hiveSyncConfig.autoCreateDatabase = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE().key(), - DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE().defaultValue())); - hiveSyncConfig.ignoreExceptions = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_IGNORE_EXCEPTIONS().key(), - DataSourceWriteOptions.HIVE_IGNORE_EXCEPTIONS().defaultValue())); - hiveSyncConfig.skipROSuffix = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE().key(), - DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE().defaultValue())); - hiveSyncConfig.supportTimestamp = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SUPPORT_TIMESTAMP_TYPE().key(), - DataSourceWriteOptions.HIVE_SUPPORT_TIMESTAMP_TYPE().defaultValue())); - return hiveSyncConfig; - } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index 51bcd882d5d8a..f10f1fdf7eadc 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -25,9 +25,10 @@ import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.common.util.Option import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.hive.util.ConfigUtils -import org.apache.hudi.hive.{HiveStylePartitionValueExtractor, HiveSyncTool, MultiPartKeysValueExtractor, NonPartitionedExtractor, SlashEncodedDayPartitionValueExtractor} +import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} import 
org.apache.hudi.keygen.constant.KeyGeneratorOptions import org.apache.hudi.keygen.{ComplexKeyGenerator, CustomKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} +import org.apache.hudi.sync.common.{HiveStylePartitionValueExtractor, HoodieSyncConfig, MultiPartKeysValueExtractor, NonPartitionedExtractor, SlashEncodedDayPartitionValueExtractor} import org.apache.log4j.LogManager import org.apache.spark.sql.execution.datasources.{DataSourceUtils => SparkDataSourceUtils} @@ -240,7 +241,7 @@ object DataSourceWriteOptions { } val TABLE_NAME: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.write.table.name") + .key(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY) .noDefaultValue() .withDocumentation("Table name for the datasource write. Also used to register the table into meta stores.") @@ -369,163 +370,6 @@ object DataSourceWriteOptions { + "evolved, this config will upgrade the records to leverage latest table schema(default values will be " + "injected to missing fields). If not, the write batch would fail.") - // HIVE SYNC SPECIFIC CONFIGS - // NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes - // unexpected issues with config getting reset - val HIVE_SYNC_ENABLED: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.enable") - .defaultValue("false") - .withDocumentation("When set to true, register/sync the table to Apache Hive metastore") - - val META_SYNC_ENABLED: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.meta.sync.enable") - .defaultValue("false") - .withDocumentation("") - - val HIVE_DATABASE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.database") - .defaultValue("default") - .withDocumentation("database to sync to") - - val hiveTableOptKeyInferFunc = DataSourceOptionsHelper.scalaFunctionToJavaFunction((p: HoodieConfig) => { - if (p.contains(TABLE_NAME)) { - Option.of(p.getString(TABLE_NAME)) - } else if (p.contains(HoodieWriteConfig.TBL_NAME)) { - Option.of(p.getString(HoodieWriteConfig.TBL_NAME)) - } else { - Option.empty[String]() - } - }) - val HIVE_TABLE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.table") - .defaultValue("unknown") - .withInferFunction(hiveTableOptKeyInferFunc) - .withDocumentation("table to sync to") - - val HIVE_BASE_FILE_FORMAT: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.base_file_format") - .defaultValue("PARQUET") - .withDocumentation("Base file format for the sync.") - - val HIVE_USER: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.username") - .defaultValue("hive") - .withDocumentation("hive user name to use") - - val HIVE_PASS: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.password") - .defaultValue("hive") - .withDocumentation("hive password to use") - - val HIVE_URL: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.jdbcurl") - .defaultValue("jdbc:hive2://localhost:10000") - .withDocumentation("Hive metastore url") - - val hivePartitionFieldsInferFunc = DataSourceOptionsHelper.scalaFunctionToJavaFunction((p: HoodieConfig) => { - if (p.contains(PARTITIONPATH_FIELD)) { - Option.of(p.getString(PARTITIONPATH_FIELD)) - } else { - Option.empty[String]() - } - }) - val HIVE_PARTITION_FIELDS: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.partition_fields") - .defaultValue("") - .withDocumentation("Field in the table to use for 
determining hive partition columns.") - .withInferFunction(hivePartitionFieldsInferFunc) - - val hivePartitionExtractorInferFunc = DataSourceOptionsHelper.scalaFunctionToJavaFunction((p: HoodieConfig) => { - if (!p.contains(PARTITIONPATH_FIELD)) { - Option.of(classOf[NonPartitionedExtractor].getName) - } else { - val numOfPartFields = p.getString(PARTITIONPATH_FIELD).split(",").length - if (numOfPartFields == 1 && p.contains(HIVE_STYLE_PARTITIONING) && p.getString(HIVE_STYLE_PARTITIONING) == "true") { - Option.of(classOf[HiveStylePartitionValueExtractor].getName) - } else { - Option.of(classOf[MultiPartKeysValueExtractor].getName) - } - } - }) - val HIVE_PARTITION_EXTRACTOR_CLASS: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.partition_extractor_class") - .defaultValue(classOf[SlashEncodedDayPartitionValueExtractor].getCanonicalName) - .withDocumentation("Class which implements PartitionValueExtractor to extract the partition values, " - + "default 'SlashEncodedDayPartitionValueExtractor'.") - .withInferFunction(hivePartitionExtractorInferFunc) - - val HIVE_ASSUME_DATE_PARTITION: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.assume_date_partitioning") - .defaultValue("false") - .withDocumentation("Assume partitioning is yyyy/mm/dd") - - val HIVE_USE_PRE_APACHE_INPUT_FORMAT: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.use_pre_apache_input_format") - .defaultValue("false") - .withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. " - + "Use this when you are in the process of migrating from " - + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format") - - /** @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0 */ - @Deprecated - val HIVE_USE_JDBC: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.use_jdbc") - .defaultValue("true") - .deprecatedAfter("0.9.0") - .withDocumentation("Use JDBC when hive synchronization is enabled") - - val HIVE_AUTO_CREATE_DATABASE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.auto_create_database") - .defaultValue("true") - .withDocumentation("Auto create hive database if does not exists") - - val HIVE_IGNORE_EXCEPTIONS: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.ignore_exceptions") - .defaultValue("false") - .withDocumentation("") - - val HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.skip_ro_suffix") - .defaultValue("false") - .withDocumentation("Skip the _ro suffix for Read optimized table, when registering") - - val HIVE_SUPPORT_TIMESTAMP_TYPE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.support_timestamp") - .defaultValue("false") - .withDocumentation("‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. 
" + - "Disabled by default for backward compatibility.") - - val HIVE_TABLE_PROPERTIES: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.table_properties") - .noDefaultValue() - .withDocumentation("Additional properties to store with table.") - - val HIVE_TABLE_SERDE_PROPERTIES: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.serde_properties") - .noDefaultValue() - .withDocumentation("Serde properties to hive table.") - - val HIVE_SYNC_AS_DATA_SOURCE_TABLE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.sync_as_datasource") - .defaultValue("true") - .withDocumentation("") - - // Create table as managed table - val HIVE_CREATE_MANAGED_TABLE: ConfigProperty[Boolean] = ConfigProperty - .key("hoodie.datasource.hive_sync.create_managed_table") - .defaultValue(false) - .withDocumentation("Whether to sync the table as managed table.") - - val HIVE_BATCH_SYNC_PARTITION_NUM: ConfigProperty[Int] = ConfigProperty - .key("hoodie.datasource.hive_sync.batch_num") - .defaultValue(1000) - .withDocumentation("The number of partitions one batch when synchronous partitions to hive.") - - val HIVE_SYNC_MODE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.hive_sync.mode") - .noDefaultValue() - .withDocumentation("Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.") - // Async Compaction - Enabled by default for MOR val ASYNC_COMPACT_ENABLE: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.compaction.async.enable") @@ -558,19 +402,19 @@ object DataSourceWriteOptions { /** @deprecated Use {@link HIVE_ASSUME_DATE_PARTITION} and its methods instead */ @Deprecated - val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = HIVE_ASSUME_DATE_PARTITION.key() + val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key() /** @deprecated Use {@link HIVE_USE_PRE_APACHE_INPUT_FORMAT} and its methods instead */ @Deprecated - val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = HIVE_USE_PRE_APACHE_INPUT_FORMAT.key() + val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key() /** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */ @Deprecated - val HIVE_USE_JDBC_OPT_KEY = HIVE_USE_JDBC.key() + val HIVE_USE_JDBC_OPT_KEY = HiveSyncConfig.HIVE_USE_JDBC.key() /** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */ @Deprecated - val HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HIVE_AUTO_CREATE_DATABASE.key() + val HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key() /** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */ @Deprecated - val HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HIVE_IGNORE_EXCEPTIONS.key() + val HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.key() /** @deprecated Use {@link STREAMING_IGNORE_FAILED_BATCH} and its methods instead */ @Deprecated val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = STREAMING_IGNORE_FAILED_BATCH.key() @@ -585,34 +429,34 @@ object DataSourceWriteOptions { val DEFAULT_META_SYNC_CLIENT_TOOL_CLASS = META_SYNC_CLIENT_TOOL_CLASS_NAME.defaultValue() /** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */ @Deprecated - val HIVE_SYNC_ENABLED_OPT_KEY = HIVE_SYNC_ENABLED.key() + val HIVE_SYNC_ENABLED_OPT_KEY = HiveSyncConfig.HIVE_SYNC_ENABLED.key() /** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */ @Deprecated - val META_SYNC_ENABLED_OPT_KEY = META_SYNC_ENABLED.key() + val 
META_SYNC_ENABLED_OPT_KEY = HoodieSyncConfig.META_SYNC_ENABLED.key() /** @deprecated Use {@link HIVE_DATABASE} and its methods instead */ @Deprecated - val HIVE_DATABASE_OPT_KEY = HIVE_DATABASE.key() + val HIVE_DATABASE_OPT_KEY = HoodieSyncConfig.META_SYNC_DATABASE_NAME.key() /** @deprecated Use {@link HIVE_TABLE} and its methods instead */ @Deprecated - val HIVE_TABLE_OPT_KEY = HIVE_TABLE.key() + val HIVE_TABLE_OPT_KEY = HoodieSyncConfig.META_SYNC_TABLE_NAME.key() /** @deprecated Use {@link HIVE_BASE_FILE_FORMAT} and its methods instead */ @Deprecated - val HIVE_BASE_FILE_FORMAT_OPT_KEY = HIVE_BASE_FILE_FORMAT.key() + val HIVE_BASE_FILE_FORMAT_OPT_KEY = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key() /** @deprecated Use {@link HIVE_USER} and its methods instead */ @Deprecated - val HIVE_USER_OPT_KEY = HIVE_USER.key() + val HIVE_USER_OPT_KEY = HiveSyncConfig.HIVE_USER.key() /** @deprecated Use {@link HIVE_PASS} and its methods instead */ @Deprecated - val HIVE_PASS_OPT_KEY = HIVE_PASS.key() + val HIVE_PASS_OPT_KEY = HiveSyncConfig.HIVE_PASS.key() /** @deprecated Use {@link HIVE_URL} and its methods instead */ @Deprecated - val HIVE_URL_OPT_KEY = HIVE_URL.key() + val HIVE_URL_OPT_KEY = HiveSyncConfig.HIVE_URL.key() /** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */ @Deprecated - val HIVE_PARTITION_FIELDS_OPT_KEY = HIVE_PARTITION_FIELDS.key() + val HIVE_PARTITION_FIELDS_OPT_KEY = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key() /** @deprecated Use {@link HIVE_PARTITION_EXTRACTOR_CLASS} and its methods instead */ @Deprecated - val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = HIVE_PARTITION_EXTRACTOR_CLASS.key() + val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key() /** @deprecated Use {@link KEYGENERATOR_CLASS} and its methods instead */ @Deprecated @@ -722,60 +566,60 @@ object DataSourceWriteOptions { /** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */ @Deprecated - val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = HIVE_SYNC_ENABLED.defaultValue() + val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = HiveSyncConfig.HIVE_SYNC_ENABLED.defaultValue() /** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */ @Deprecated - val DEFAULT_META_SYNC_ENABLED_OPT_VAL = META_SYNC_ENABLED.defaultValue() + val DEFAULT_META_SYNC_ENABLED_OPT_VAL = HoodieSyncConfig.META_SYNC_ENABLED.defaultValue() /** @deprecated Use {@link HIVE_DATABASE} and its methods instead */ @Deprecated - val DEFAULT_HIVE_DATABASE_OPT_VAL = HIVE_DATABASE.defaultValue() + val DEFAULT_HIVE_DATABASE_OPT_VAL = HoodieSyncConfig.META_SYNC_DATABASE_NAME.defaultValue() /** @deprecated Use {@link HIVE_TABLE} and its methods instead */ @Deprecated - val DEFAULT_HIVE_TABLE_OPT_VAL = HIVE_TABLE.defaultValue() + val DEFAULT_HIVE_TABLE_OPT_VAL = HoodieSyncConfig.META_SYNC_TABLE_NAME.defaultValue() /** @deprecated Use {@link HIVE_BASE_FILE_FORMAT} and its methods instead */ @Deprecated - val DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL = HIVE_BASE_FILE_FORMAT.defaultValue() + val DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue() /** @deprecated Use {@link HIVE_USER} and its methods instead */ @Deprecated - val DEFAULT_HIVE_USER_OPT_VAL = HIVE_USER.defaultValue() + val DEFAULT_HIVE_USER_OPT_VAL = HiveSyncConfig.HIVE_USER.defaultValue() /** @deprecated Use {@link HIVE_PASS} and its methods instead */ @Deprecated - val DEFAULT_HIVE_PASS_OPT_VAL = HIVE_PASS.defaultValue() + val DEFAULT_HIVE_PASS_OPT_VAL =
HiveSyncConfig.HIVE_PASS.defaultValue() /** @deprecated Use {@link HIVE_URL} and its methods instead */ @Deprecated - val DEFAULT_HIVE_URL_OPT_VAL = HIVE_URL.defaultValue() + val DEFAULT_HIVE_URL_OPT_VAL = HiveSyncConfig.HIVE_URL.defaultValue() /** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */ @Deprecated - val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = HIVE_PARTITION_FIELDS.defaultValue() + val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.defaultValue() /** @deprecated Use {@link HIVE_PARTITION_EXTRACTOR_CLASS} and its methods instead */ @Deprecated - val DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL = HIVE_PARTITION_EXTRACTOR_CLASS.defaultValue() + val DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.defaultValue() /** @deprecated Use {@link HIVE_ASSUME_DATE_PARTITION} and its methods instead */ @Deprecated - val DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL = HIVE_ASSUME_DATE_PARTITION.defaultValue() + val DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.defaultValue() @Deprecated val DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL = "false" /** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */ @Deprecated - val DEFAULT_HIVE_USE_JDBC_OPT_VAL = HIVE_USE_JDBC.defaultValue() + val DEFAULT_HIVE_USE_JDBC_OPT_VAL = HiveSyncConfig.HIVE_USE_JDBC.defaultValue() /** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */ @Deprecated - val DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HIVE_AUTO_CREATE_DATABASE.defaultValue() + val DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.defaultValue() /** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */ @Deprecated - val DEFAULT_HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HIVE_IGNORE_EXCEPTIONS.defaultValue() + val DEFAULT_HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.defaultValue() /** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */ @Deprecated - val HIVE_SKIP_RO_SUFFIX = HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key() + val HIVE_SKIP_RO_SUFFIX = HiveSyncConfig.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key() /** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */ @Deprecated - val DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL = HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue() + val DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL = HiveSyncConfig.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue() /** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */ @Deprecated - val HIVE_SUPPORT_TIMESTAMP = HIVE_SUPPORT_TIMESTAMP_TYPE.key() + val HIVE_SUPPORT_TIMESTAMP = HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key() /** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */ @Deprecated - val DEFAULT_HIVE_SUPPORT_TIMESTAMP = HIVE_SUPPORT_TIMESTAMP_TYPE.defaultValue() + val DEFAULT_HIVE_SUPPORT_TIMESTAMP = HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.defaultValue() /** @deprecated Use {@link ASYNC_COMPACT_ENABLE} and its methods instead */ @Deprecated val ASYNC_COMPACT_ENABLE_OPT_KEY = ASYNC_COMPACT_ENABLE.key() diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 8ebc896fcb2bd..f9a575f366960 100644 --- 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -19,11 +19,8 @@ package org.apache.hudi import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.hive.conf.HiveConf - import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieWriterUtils._ import org.apache.hudi.avro.HoodieAvroUtils @@ -42,11 +39,10 @@ import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} import org.apache.hudi.index.SparkHoodieIndexFactory import org.apache.hudi.internal.DataSourceInternalWriterHelper import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory -import org.apache.hudi.sync.common.AbstractSyncTool +import org.apache.hudi.sync.common.util.SyncUtilHelpers +import org.apache.hudi.sync.common.{AbstractSyncTool, HoodieSyncConfig} import org.apache.hudi.table.BulkInsertPartitioner - import org.apache.log4j.LogManager - import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} @@ -54,8 +50,6 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SaveMode, SparkSession} import org.apache.spark.{SPARK_VERSION, SparkContext} -import java.util.Properties - import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ListBuffer @@ -180,7 +174,7 @@ object HoodieSparkSqlWriter { val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, null, path, tblName, mapAsJavaMap(parameters - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key))) - .asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] + .asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] if (isAsyncCompactionEnabled(client, tableConfig, parameters, jsc.hadoopConfiguration())) { asyncCompactionTriggerFn.get.apply(client) @@ -203,7 +197,7 @@ object HoodieSparkSqlWriter { // Get list of partitions to delete val partitionsToDelete = if (parameters.containsKey(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key())) { val partitionColsToDelete = parameters.get(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key()).get.split(",") - java.util.Arrays.asList(partitionColsToDelete:_*) + java.util.Arrays.asList(partitionColsToDelete: _*) } else { genericRecords.map(gr => keyGenerator.getKey(gr).getPartitionPath).toJavaRDD().distinct().collect() } @@ -288,7 +282,7 @@ object HoodieSparkSqlWriter { } def generateSchemaWithoutPartitionColumns(partitionParam: String, schema: Schema): Schema = { - val fieldsToRemove = new java.util.ArrayList[String]() + val fieldsToRemove = new java.util.ArrayList[String]() partitionParam.split(",").map(partitionField => partitionField.trim) .filter(s => !s.isEmpty).map(field => fieldsToRemove.add(field)) HoodieAvroUtils.removeFields(schema, fieldsToRemove) @@ -323,8 +317,8 @@ object HoodieSparkSqlWriter { latestSchema } - def registerKryoClassesAndGetGenericRecords(tblName: String, sparkContext : SparkContext, df: Dataset[Row], - reconcileSchema: Boolean) : RDD[GenericRecord] = { + def registerKryoClassesAndGetGenericRecords(tblName: String, sparkContext: SparkContext, df: Dataset[Row], + reconcileSchema: Boolean): RDD[GenericRecord] = { val structName = s"${tblName}_record" val nameSpace = s"hoodie.${tblName}" 
sparkContext.getConf.registerKryoClasses( @@ -441,9 +435,9 @@ object HoodieSparkSqlWriter { } val params = parameters.updated(HoodieWriteConfig.AVRO_SCHEMA_STRING.key, schema.toString) val writeConfig = DataSourceUtils.createHoodieConfig(schema.toString, path, tblName, mapAsJavaMap(params)) - val bulkInsertPartitionerRows : BulkInsertPartitioner[Dataset[Row]] = if (populateMetaFields) { + val bulkInsertPartitionerRows: BulkInsertPartitioner[Dataset[Row]] = if (populateMetaFields) { val userDefinedBulkInsertPartitionerOpt = DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(writeConfig) - if (userDefinedBulkInsertPartitionerOpt.isPresent) { + if (userDefinedBulkInsertPartitionerOpt.isPresent) { userDefinedBulkInsertPartitionerOpt.get } else { @@ -484,14 +478,7 @@ object HoodieSparkSqlWriter { + " To use row writer please switch to spark 2 or spark 3") } val hoodieConfig = HoodieWriterUtils.convertMapToHoodieConfig(params) - val hiveSyncEnabled = hoodieConfig.getStringOrDefault(HIVE_SYNC_ENABLED).toBoolean - val metaSyncEnabled = hoodieConfig.getStringOrDefault(META_SYNC_ENABLED).toBoolean - val syncHiveSuccess = - if (hiveSyncEnabled || metaSyncEnabled) { - metaSync(sqlContext.sparkSession, hoodieConfig, basePath, df.schema) - } else { - true - } + val syncHiveSuccess = metaSync(sqlContext.sparkSession, hoodieConfig, basePath, df.schema) (syncHiveSuccess, common.util.Option.ofNullable(instantTime)) } @@ -515,7 +502,7 @@ object HoodieSparkSqlWriter { if (operation != WriteOperationType.DELETE) { if (mode == SaveMode.ErrorIfExists && tableExists) { throw new HoodieException(s"hoodie table at $tablePath already exists.") - } else if (mode == SaveMode.Overwrite && tableExists && operation != WriteOperationType.INSERT_OVERWRITE_TABLE) { + } else if (mode == SaveMode.Overwrite && tableExists && operation != WriteOperationType.INSERT_OVERWRITE_TABLE) { // When user set operation as INSERT_OVERWRITE_TABLE, // overwrite will use INSERT_OVERWRITE_TABLE operator in doWriteOperation log.warn(s"hoodie table at $tablePath already exists. 
Deleting existing data & overwriting with new data.") @@ -530,80 +517,32 @@ object HoodieSparkSqlWriter { } } - private def syncHive(basePath: Path, fs: FileSystem, hoodieConfig: HoodieConfig, sqlConf: SQLConf): Boolean = { - val hiveSyncConfig: HiveSyncConfig = buildSyncConfig(basePath, hoodieConfig, sqlConf) - val hiveConf: HiveConf = new HiveConf() - hiveConf.addResource(fs.getConf) - new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable() - true - } - - private def buildSyncConfig(basePath: Path, hoodieConfig: HoodieConfig, sqlConf: SQLConf): HiveSyncConfig = { - val hiveSyncConfig: HiveSyncConfig = new HiveSyncConfig() - hiveSyncConfig.basePath = basePath.toString - hiveSyncConfig.baseFileFormat = hoodieConfig.getString(HIVE_BASE_FILE_FORMAT) - hiveSyncConfig.usePreApacheInputFormat = - hoodieConfig.getStringOrDefault(HIVE_USE_PRE_APACHE_INPUT_FORMAT).toBoolean - hiveSyncConfig.databaseName = hoodieConfig.getString(HIVE_DATABASE) - hiveSyncConfig.tableName = hoodieConfig.getString(HIVE_TABLE) - hiveSyncConfig.hiveUser = hoodieConfig.getString(HIVE_USER) - hiveSyncConfig.hivePass = hoodieConfig.getString(HIVE_PASS) - hiveSyncConfig.jdbcUrl = hoodieConfig.getString(HIVE_URL) - hiveSyncConfig.skipROSuffix = hoodieConfig.getStringOrDefault(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE, - DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue).toBoolean - hiveSyncConfig.partitionFields = - ListBuffer(hoodieConfig.getString(HIVE_PARTITION_FIELDS).split(",").map(_.trim).filter(!_.isEmpty).toList: _*) - hiveSyncConfig.partitionValueExtractorClass = hoodieConfig.getString(HIVE_PARTITION_EXTRACTOR_CLASS) - hiveSyncConfig.useJdbc = hoodieConfig.getBoolean(HIVE_USE_JDBC) - hiveSyncConfig.useFileListingFromMetadata = hoodieConfig.getBoolean(HoodieMetadataConfig.ENABLE) - hiveSyncConfig.ignoreExceptions = hoodieConfig.getStringOrDefault(HIVE_IGNORE_EXCEPTIONS).toBoolean - hiveSyncConfig.supportTimestamp = hoodieConfig.getStringOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE).toBoolean - hiveSyncConfig.autoCreateDatabase = hoodieConfig.getStringOrDefault(HIVE_AUTO_CREATE_DATABASE).toBoolean - hiveSyncConfig.decodePartition = hoodieConfig.getStringOrDefault(URL_ENCODE_PARTITIONING).toBoolean - hiveSyncConfig.batchSyncNum = hoodieConfig.getStringOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM).toInt - - hiveSyncConfig.syncAsSparkDataSourceTable = hoodieConfig.getStringOrDefault(HIVE_SYNC_AS_DATA_SOURCE_TABLE).toBoolean - hiveSyncConfig.sparkSchemaLengthThreshold = sqlConf.getConf(StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD) - hiveSyncConfig.createManagedTable = hoodieConfig.getBoolean(HIVE_CREATE_MANAGED_TABLE) - hiveSyncConfig.syncMode = hoodieConfig.getString(HIVE_SYNC_MODE) - hiveSyncConfig.serdeProperties = hoodieConfig.getString(HIVE_TABLE_SERDE_PROPERTIES) - hiveSyncConfig.tableProperties = hoodieConfig.getString(HIVE_TABLE_PROPERTIES) - hiveSyncConfig - } - private def metaSync(spark: SparkSession, hoodieConfig: HoodieConfig, basePath: Path, schema: StructType): Boolean = { - val hiveSyncEnabled = hoodieConfig.getStringOrDefault(HIVE_SYNC_ENABLED).toBoolean - var metaSyncEnabled = hoodieConfig.getStringOrDefault(META_SYNC_ENABLED).toBoolean + val hiveSyncEnabled = hoodieConfig.getStringOrDefault(HiveSyncConfig.HIVE_SYNC_ENABLED).toBoolean + var metaSyncEnabled = hoodieConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_ENABLED).toBoolean var syncClientToolClassSet = scala.collection.mutable.Set[String]() - 
hoodieConfig.getString(META_SYNC_CLIENT_TOOL_CLASS_NAME).split(",").foreach(syncClass => syncClientToolClassSet += syncClass) + hoodieConfig.getString(META_SYNC_CLIENT_TOOL_CLASS_NAME).split(",").foreach(syncClass => syncClientToolClassSet += syncClass) + + var metaSyncSuccess = false + if (!hiveSyncEnabled && !metaSyncEnabled) { + metaSyncSuccess = true + } // for backward compatibility if (hiveSyncEnabled) { metaSyncEnabled = true syncClientToolClassSet += classOf[HiveSyncTool].getName } - var metaSyncSuccess = true if (metaSyncEnabled) { val fs = basePath.getFileSystem(spark.sessionState.newHadoopConf()) + val properties = new TypedProperties() + properties.putAll(hoodieConfig.getProps) + properties.put(HiveSyncConfig.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD, spark.sessionState.conf.getConf(StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD).toString) syncClientToolClassSet.foreach(impl => { - val syncSuccess = impl.trim match { - case "org.apache.hudi.hive.HiveSyncTool" => { - log.info("Syncing to Hive Metastore (URL: " + hoodieConfig.getString(HIVE_URL) + ")") - syncHive(basePath, fs, hoodieConfig, spark.sessionState.conf) - true - } - case _ => { - val properties = new Properties() - properties.putAll(hoodieConfig.getProps) - properties.put("basePath", basePath.toString) - val syncHoodie = ReflectionUtils.loadClass(impl.trim, Array[Class[_]](classOf[Properties], classOf[FileSystem]), properties, fs).asInstanceOf[AbstractSyncTool] - syncHoodie.syncHoodieTable() - true - } - } - metaSyncSuccess = metaSyncSuccess && syncSuccess + SyncUtilHelpers.createAndSyncHoodieMeta(impl.trim, properties, fs.getConf(), fs, basePath.toString, HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue) }) + metaSyncSuccess = true } metaSyncSuccess } @@ -621,8 +560,8 @@ object HoodieSparkSqlWriter { tableConfig: HoodieTableConfig, jsc: JavaSparkContext, tableInstantInfo: TableInstantInfo - ): (Boolean, common.util.Option[java.lang.String], common.util.Option[java.lang.String]) = { - if(writeResult.getWriteStatuses.rdd.filter(ws => ws.hasErrors).isEmpty()) { + ): (Boolean, common.util.Option[java.lang.String], common.util.Option[java.lang.String]) = { + if (writeResult.getWriteStatuses.rdd.filter(ws => ws.hasErrors).isEmpty()) { log.info("Proceeding to commit the write.") val metaMap = parameters.filter(kv => kv._1.startsWith(parameters(COMMIT_METADATA_KEYPREFIX.key))) @@ -668,7 +607,9 @@ object HoodieSparkSqlWriter { } (commitSuccess && metaSyncSuccess, compactionInstant, clusteringInstant) } else { - log.error(s"${tableInstantInfo.operation} failed with errors") + log.error(s"${ + tableInstantInfo.operation + } failed with errors") if (log.isTraceEnabled) { log.trace("Printing out the top 100 errors") writeResult.getWriteStatuses.rdd.filter(ws => ws.hasErrors) @@ -677,7 +618,9 @@ object HoodieSparkSqlWriter { log.trace("Global error :", ws.getGlobalError) if (ws.getErrors.size() > 0) { ws.getErrors.foreach(kt => - log.trace(s"Error for key: ${kt._1}", kt._2)) + log.trace(s"Error for key: ${ + kt._1 + }", kt._2)) } }) } @@ -687,8 +630,10 @@ object HoodieSparkSqlWriter { private def isAsyncCompactionEnabled(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]], tableConfig: HoodieTableConfig, - parameters: Map[String, String], configuration: Configuration) : Boolean = { - log.info(s"Config.inlineCompactionEnabled ? ${client.getConfig.inlineCompactionEnabled}") + parameters: Map[String, String], configuration: Configuration): Boolean = { + log.info(s"Config.inlineCompactionEnabled ? 
${ + client.getConfig.inlineCompactionEnabled + }") if (asyncCompactionTriggerFnDefined && !client.getConfig.inlineCompactionEnabled && parameters.get(ASYNC_COMPACT_ENABLE.key).exists(r => r.toBoolean)) { tableConfig.getTableType == HoodieTableType.MERGE_ON_READ @@ -698,8 +643,10 @@ object HoodieSparkSqlWriter { } private def isAsyncClusteringEnabled(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]], - parameters: Map[String, String]) : Boolean = { - log.info(s"Config.asyncClusteringEnabled ? ${client.getConfig.isAsyncClusteringEnabled}") + parameters: Map[String, String]): Boolean = { + log.info(s"Config.asyncClusteringEnabled ? ${ + client.getConfig.isAsyncClusteringEnabled + }") asyncClusteringTriggerFnDefined && client.getConfig.isAsyncClusteringEnabled && parameters.get(ASYNC_CLUSTERING_ENABLE.key).exists(r => r.toBoolean) } @@ -717,7 +664,7 @@ object HoodieSparkSqlWriter { } private def mergeParamsAndGetHoodieConfig(optParams: Map[String, String], - tableConfig: HoodieTableConfig): (Map[String, String], HoodieConfig) = { + tableConfig: HoodieTableConfig): (Map[String, String], HoodieConfig) = { val translatedOptions = DataSourceWriteOptions.translateSqlOptions(optParams) val mergedParams = mutable.Map.empty ++ HoodieWriterUtils.parametersWithWriteDefaults(translatedOptions) if (!mergedParams.contains(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) @@ -725,8 +672,9 @@ object HoodieSparkSqlWriter { mergedParams(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) = mergedParams(KEYGENERATOR_CLASS_NAME.key) } if (null != tableConfig) { - tableConfig.getProps.foreach { case (key, value) => - mergedParams(key) = value + tableConfig.getProps.foreach { + case (key, value) => + mergedParams(key) = value } } val params = mergedParams.toMap diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala index 282de54f65259..c6d16f099c96f 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala @@ -18,13 +18,14 @@ package org.apache.hudi import java.util.Properties - import org.apache.hudi.DataSourceOptionsHelper.allAlternatives import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE import org.apache.hudi.common.config.{DFSPropertiesConfiguration, HoodieConfig, TypedProperties} import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.exception.HoodieException +import org.apache.hudi.hive.HiveSyncConfig +import org.apache.hudi.sync.common.HoodieSyncConfig import org.apache.spark.sql.SparkSession import org.apache.spark.sql.hudi.command.SqlKeyGenerator @@ -64,20 +65,20 @@ object HoodieWriterUtils { hoodieConfig.setDefaultValue(STREAMING_RETRY_INTERVAL_MS) hoodieConfig.setDefaultValue(STREAMING_IGNORE_FAILED_BATCH) hoodieConfig.setDefaultValue(META_SYNC_CLIENT_TOOL_CLASS_NAME) - hoodieConfig.setDefaultValue(HIVE_SYNC_ENABLED) - hoodieConfig.setDefaultValue(META_SYNC_ENABLED) - hoodieConfig.setDefaultValue(HIVE_DATABASE) - hoodieConfig.setDefaultValue(HIVE_TABLE) - hoodieConfig.setDefaultValue(HIVE_BASE_FILE_FORMAT) - hoodieConfig.setDefaultValue(HIVE_USER) - hoodieConfig.setDefaultValue(HIVE_PASS) - hoodieConfig.setDefaultValue(HIVE_URL) - hoodieConfig.setDefaultValue(HIVE_PARTITION_FIELDS) - hoodieConfig.setDefaultValue(HIVE_PARTITION_EXTRACTOR_CLASS) + 
hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_SYNC_ENABLED) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_ENABLED) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_DATABASE_NAME) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT) + hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_USER) + hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_PASS) + hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_URL) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS) hoodieConfig.setDefaultValue(HIVE_STYLE_PARTITIONING) - hoodieConfig.setDefaultValue(HIVE_USE_JDBC) - hoodieConfig.setDefaultValue(HIVE_CREATE_MANAGED_TABLE) - hoodieConfig.setDefaultValue(HIVE_SYNC_AS_DATA_SOURCE_TABLE) + hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_USE_JDBC) + hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE) + hoodieConfig.setDefaultValue(HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE) hoodieConfig.setDefaultValue(ASYNC_COMPACT_ENABLE) hoodieConfig.setDefaultValue(INLINE_CLUSTERING_ENABLE) hoodieConfig.setDefaultValue(ASYNC_CLUSTERING_ENABLE) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala index ce6237ec99344..a930045d4bd5b 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala @@ -19,11 +19,10 @@ package org.apache.spark.sql.hudi.command import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path - import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.hive.util.ConfigUtils import org.apache.hudi.sql.InsertMode - import org.apache.spark.sql.{Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, HoodieCatalogTable} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} @@ -82,9 +81,9 @@ case class CreateHoodieTableAsSelectCommand( val tblProperties = hoodieCatalogTable.catalogProperties val options = Map( - DataSourceWriteOptions.HIVE_CREATE_MANAGED_TABLE.key -> (table.tableType == CatalogTableType.MANAGED).toString, - DataSourceWriteOptions.HIVE_TABLE_SERDE_PROPERTIES.key -> ConfigUtils.configToString(tblProperties.asJava), - DataSourceWriteOptions.HIVE_TABLE_PROPERTIES.key -> ConfigUtils.configToString(table.properties.asJava), + HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE.key -> (table.tableType == CatalogTableType.MANAGED).toString, + HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES.key -> ConfigUtils.configToString(tblProperties.asJava), + HiveSyncConfig.HIVE_TABLE_PROPERTIES.key -> ConfigUtils.configToString(table.properties.asJava), DataSourceWriteOptions.SQL_INSERT_MODE.key -> InsertMode.NON_STRICT.value(), DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key -> "true" ) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala index 
8475b631c586d..bb876b3076792 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.hudi.command import org.apache.hudi.DataSourceWriteOptions.{OPERATION, _} import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME +import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.hive.ddl.HiveSyncMode import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport} - import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable import org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable @@ -77,8 +77,8 @@ case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends Runnab SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, OPERATION.key -> DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL, PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, - HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), - HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", + HiveSyncConfig.HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), + HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> "200", SqlKeyGenerator.PARTITION_SCHEMA -> partitionSchema.toDDL ) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala index 4b6d8e06f7489..5ad9082b42762 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala @@ -19,19 +19,18 @@ package org.apache.spark.sql.hudi.command import org.apache.avro.Schema import org.apache.avro.generic.{GenericRecord, IndexedRecord} - import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieRecord} import org.apache.hudi.common.util.{Option => HOption} import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import org.apache.hudi.exception.HoodieDuplicateKeyException -import org.apache.hudi.hive.MultiPartKeysValueExtractor +import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.hive.ddl.HiveSyncMode import org.apache.hudi.keygen.ComplexKeyGenerator import org.apache.hudi.sql.InsertMode +import org.apache.hudi.sync.common.{HoodieSyncConfig, MultiPartKeysValueExtractor} import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkSqlWriter} - import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable} import org.apache.spark.sql.catalyst.expressions.{Alias, Literal} @@ -43,7 +42,6 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import java.util.Properties - import scala.collection.JavaConverters._ /** @@ -276,14 +274,14 @@ object InsertIntoHoodieTableCommand extends Logging { PAYLOAD_CLASS_NAME.key -> payloadClassName, ENABLE_ROW_WRITER.key -> enableBulkInsert.toString, HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPrecombineColumn), - 
META_SYNC_ENABLED.key -> enableHive.toString, - HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), - HIVE_USE_JDBC.key -> "false", - HIVE_DATABASE.key -> hoodieCatalogTable.table.identifier.database.getOrElse("default"), - HIVE_TABLE.key -> hoodieCatalogTable.table.identifier.table, - HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", - HIVE_PARTITION_FIELDS.key -> partitionFields, - HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, + HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString, + HiveSyncConfig.HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), + HiveSyncConfig.HIVE_USE_JDBC.key -> "false", + HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hoodieCatalogTable.table.identifier.database.getOrElse("default"), + HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hoodieCatalogTable.table.identifier.table, + HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", + HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFields, + HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "200", HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala index 76c87158684f8..d8fc8a8feef14 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala @@ -22,10 +22,10 @@ import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.util.StringUtils import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME -import org.apache.hudi.hive.MultiPartKeysValueExtractor +import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.hive.ddl.HiveSyncMode +import org.apache.hudi.sync.common.{HoodieSyncConfig, MultiPartKeysValueExtractor} import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, HoodieSparkSqlWriter, HoodieWriterUtils, SparkAdapterSupport} - import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.Resolver @@ -452,14 +452,14 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Runnab URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning, KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, - META_SYNC_ENABLED.key -> enableHive.toString, - HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), - HIVE_USE_JDBC.key -> "false", - HIVE_DATABASE.key -> targetTableDb, - HIVE_TABLE.key -> targetTableName, - HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", - HIVE_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, - HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, + HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString, + HiveSyncConfig.HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), + HiveSyncConfig.HIVE_USE_JDBC.key -> "false", + HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> targetTableDb, + 
HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> targetTableName, + HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", + HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, + HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "200", // set the default parallelism to 200 for sql HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> "200", diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala index 7397b0dad942b..ded7d775a7ad3 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala @@ -22,9 +22,9 @@ import org.apache.hudi.SparkAdapterSupport import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME -import org.apache.hudi.hive.MultiPartKeysValueExtractor +import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.hive.ddl.HiveSyncMode - +import org.apache.hudi.sync.common.{HoodieSyncConfig, MultiPartKeysValueExtractor} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} @@ -106,14 +106,14 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends RunnableCo SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, OPERATION.key -> UPSERT_OPERATION_OPT_VAL, PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, - META_SYNC_ENABLED.key -> enableHive.toString, - HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), - HIVE_USE_JDBC.key -> "false", - HIVE_DATABASE.key -> tableId.database.getOrElse("default"), - HIVE_TABLE.key -> tableId.table, - HIVE_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, - HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, - HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", + HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString, + HiveSyncConfig.HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), + HiveSyncConfig.HIVE_USE_JDBC.key -> "false", + HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> tableId.database.getOrElse("default"), + HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> tableId.table, + HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, + HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, + HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL ) diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaApp.java b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaApp.java index b6e595c40a8df..f465df8b85902 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaApp.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaApp.java @@ -24,9 +24,10 @@ import 
org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.hive.MultiPartKeysValueExtractor; -import org.apache.hudi.hive.NonPartitionedExtractor; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; +import org.apache.hudi.sync.common.NonPartitionedExtractor; +import org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.keygen.NonpartitionedKeyGenerator; import org.apache.hudi.keygen.SimpleKeyGenerator; @@ -255,24 +256,24 @@ public void run() throws Exception { private DataFrameWriter updateHiveSyncConfig(DataFrameWriter writer) { if (enableHiveSync) { LOG.info("Enabling Hive sync to " + hiveJdbcUrl); - writer = writer.option(DataSourceWriteOptions.HIVE_TABLE().key(), hiveTable) - .option(DataSourceWriteOptions.HIVE_DATABASE().key(), hiveDB) - .option(DataSourceWriteOptions.HIVE_URL().key(), hiveJdbcUrl) - .option(DataSourceWriteOptions.HIVE_USER().key(), hiveUser) - .option(DataSourceWriteOptions.HIVE_PASS().key(), hivePass) - .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED().key(), "true"); + writer = writer.option(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), hiveTable) + .option(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), hiveDB) + .option(HiveSyncConfig.HIVE_URL.key(), hiveJdbcUrl) + .option(HiveSyncConfig.HIVE_USER.key(), hiveUser) + .option(HiveSyncConfig.HIVE_PASS.key(), hivePass) + .option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true"); if (nonPartitionedTable) { writer = writer - .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + .option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName()) .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), ""); } else if (useMultiPartitionKeys) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "year,month,day").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName()); } else { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "dateStr").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "dateStr").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), SlashEncodedDayPartitionValueExtractor.class.getCanonicalName()); } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaGenerateApp.java b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaGenerateApp.java index 8302ece4b9ae9..c9e06f5b120b5 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaGenerateApp.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaGenerateApp.java @@ -23,9 +23,10 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.hive.MultiPartKeysValueExtractor; -import org.apache.hudi.hive.NonPartitionedExtractor; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import 
org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; +import org.apache.hudi.sync.common.NonPartitionedExtractor; +import org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.keygen.NonpartitionedKeyGenerator; import org.apache.hudi.keygen.SimpleKeyGenerator; @@ -125,24 +126,25 @@ private HoodieTestDataGenerator getDataGenerate() { private DataFrameWriter updateHiveSyncConfig(DataFrameWriter writer) { if (enableHiveSync) { LOG.info("Enabling Hive sync to " + hiveJdbcUrl); - writer = writer.option(DataSourceWriteOptions.HIVE_TABLE().key(), hiveTable) - .option(DataSourceWriteOptions.HIVE_DATABASE().key(), hiveDB) - .option(DataSourceWriteOptions.HIVE_URL().key(), hiveJdbcUrl) - .option(DataSourceWriteOptions.HIVE_USER().key(), hiveUser) - .option(DataSourceWriteOptions.HIVE_PASS().key(), hivePass) - .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED().key(), "true"); + writer = writer.option(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), hiveTable) + .option(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), hiveDB) + .option(HiveSyncConfig.HIVE_URL.key(), hiveJdbcUrl) + .option(HiveSyncConfig.HIVE_USER.key(), hiveUser) + .option(HiveSyncConfig.HIVE_PASS.key(), hivePass) + .option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true"); + if (nonPartitionedTable) { writer = writer - .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + .option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName()) .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), ""); } else if (useMultiPartitionKeys) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "year,month,day").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName()); } else { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "dateStr").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "dateStr").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), SlashEncodedDayPartitionValueExtractor.class.getCanonicalName()); } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java index f1e6b45b292b7..a071fd0f5fc45 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java @@ -27,8 +27,9 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.TableNotFoundException; -import org.apache.hudi.hive.MultiPartKeysValueExtractor; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; +import org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; @@ -381,19 +382,19 @@ public void stream(Dataset streamingInput, String operationType, String che private DataStreamWriter updateHiveSyncConfig(DataStreamWriter writer) { if 
(enableHiveSync) { LOG.info("Enabling Hive sync to " + hiveJdbcUrl); - writer = writer.option(DataSourceWriteOptions.HIVE_TABLE().key(), hiveTable) - .option(DataSourceWriteOptions.HIVE_DATABASE().key(), hiveDB) - .option(DataSourceWriteOptions.HIVE_URL().key(), hiveJdbcUrl) - .option(DataSourceWriteOptions.HIVE_USER().key(), hiveUser) - .option(DataSourceWriteOptions.HIVE_PASS().key(), hivePass) - .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED().key(), "true"); + writer = writer.option(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), hiveTable) + .option(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), hiveDB) + .option(HiveSyncConfig.HIVE_URL.key(), hiveJdbcUrl) + .option(HiveSyncConfig.HIVE_USER.key(), hiveUser) + .option(HiveSyncConfig.HIVE_PASS.key(), hivePass) + .option(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true"); if (useMultiPartitionKeys) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "year,month,day").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName()); } else { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "dateStr").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "dateStr").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), SlashEncodedDayPartitionValueExtractor.class.getCanonicalName()); } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 6353aa2165123..4f404f80b881b 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -20,7 +20,6 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.SparkRDDWriteClient; -import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; @@ -28,7 +27,6 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; -import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.avro.Conversions; @@ -43,8 +41,6 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; import org.mockito.ArgumentCaptor; import org.mockito.Captor; import org.mockito.Mock; @@ -53,18 +49,13 @@ import java.math.BigDecimal; import java.time.LocalDate; -import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; -import static org.apache.hudi.hive.ddl.HiveSyncMode.HMS; import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.instanceOf; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; import static 
org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.times; @@ -211,29 +202,6 @@ public void testCreateRDDCustomColumnsSortPartitionerWithValidPartitioner() thro assertThat(partitioner.isPresent(), is(true)); } - @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testBuildHiveSyncConfig(boolean useSyncMode) { - TypedProperties props = new TypedProperties(); - if (useSyncMode) { - props.setProperty(DataSourceWriteOptions.HIVE_SYNC_MODE().key(), HMS.name()); - props.setProperty(DataSourceWriteOptions.HIVE_USE_JDBC().key(), String.valueOf(false)); - } - props.setProperty(DataSourceWriteOptions.HIVE_DATABASE().key(), HIVE_DATABASE); - props.setProperty(DataSourceWriteOptions.HIVE_TABLE().key(), HIVE_TABLE); - HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, config.getBasePath(), PARQUET.name()); - - if (useSyncMode) { - assertFalse(hiveSyncConfig.useJdbc); - assertEquals(HMS.name(), hiveSyncConfig.syncMode); - } else { - assertTrue(hiveSyncConfig.useJdbc); - assertNull(hiveSyncConfig.syncMode); - } - assertEquals(HIVE_DATABASE, hiveSyncConfig.databaseName); - assertEquals(HIVE_TABLE, hiveSyncConfig.tableName); - } - private void setAndVerifyHoodieWriteClientWith(final String partitionerClassName) { config = HoodieWriteConfig.newBuilder().withPath(config.getBasePath()) .withUserDefinedBulkInsertPartitionerClass(partitionerClassName) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceOptions.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceOptions.scala index d5c3bfa01fc2e..8026ae260db35 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceOptions.scala @@ -18,8 +18,8 @@ package org.apache.hudi import org.apache.hudi.DataSourceWriteOptions._ -import org.apache.hudi.hive.{HiveStylePartitionValueExtractor, MultiPartKeysValueExtractor} import org.apache.hudi.keygen.{ComplexKeyGenerator, SimpleKeyGenerator} +import org.apache.hudi.sync.common.{HiveStylePartitionValueExtractor, HoodieSyncConfig, MultiPartKeysValueExtractor} import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Test @@ -31,10 +31,10 @@ class TestDataSourceOptions { ) val modifiedOptions1 = HoodieWriterUtils.parametersWithWriteDefaults(inputOptions1) assertEquals(classOf[ComplexKeyGenerator].getName, modifiedOptions1(KEYGENERATOR_CLASS_NAME.key)) - assertEquals("hudi_table", modifiedOptions1(HIVE_TABLE.key)) - assertEquals("year,month", modifiedOptions1(HIVE_PARTITION_FIELDS.key)) + assertEquals("hudi_table", modifiedOptions1(HoodieSyncConfig.META_SYNC_TABLE_NAME.key)) + assertEquals("year,month", modifiedOptions1(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key)) assertEquals(classOf[MultiPartKeysValueExtractor].getName, - modifiedOptions1(HIVE_PARTITION_EXTRACTOR_CLASS.key)) + modifiedOptions1(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key)) val inputOptions2 = Map( TABLE_NAME.key -> "hudi_table", @@ -43,9 +43,9 @@ class TestDataSourceOptions { ) val modifiedOptions2 = 
HoodieWriterUtils.parametersWithWriteDefaults(inputOptions2) assertEquals(classOf[SimpleKeyGenerator].getName, modifiedOptions2(KEYGENERATOR_CLASS_NAME.key)) - assertEquals("hudi_table", modifiedOptions2(HIVE_TABLE.key)) - assertEquals("year", modifiedOptions2(HIVE_PARTITION_FIELDS.key)) + assertEquals("hudi_table", modifiedOptions2(HoodieSyncConfig.META_SYNC_TABLE_NAME.key)) + assertEquals("year", modifiedOptions2(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key)) assertEquals(classOf[HiveStylePartitionValueExtractor].getName, - modifiedOptions2(HIVE_PARTITION_EXTRACTOR_CLASS.key)) + modifiedOptions2(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key)) } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala index fa248e46bd721..bf7e67efe9109 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala @@ -32,6 +32,7 @@ import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode import org.apache.hudi.functional.TestBootstrap import org.apache.hudi.hive.HiveSyncConfig import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} +import org.apache.hudi.sync.common.HoodieSyncConfig import org.apache.hudi.testutils.DataSourceTestUtils import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaSparkContext @@ -649,55 +650,6 @@ class TestHoodieSparkSqlWriter { assertEquals(expectedSchema, actualSchema) } - /** - * Test case for build sync config for spark sql. - */ - @Test - def testBuildSyncConfigForSparkSql(): Unit = { - val params = Map( - "path" -> tempBasePath, - DataSourceWriteOptions.TABLE_NAME.key -> "test_hoodie", - DataSourceWriteOptions.HIVE_PARTITION_FIELDS.key -> "partition", - DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key -> "true", - DataSourceWriteOptions.HIVE_CREATE_MANAGED_TABLE.key -> "true" - ) - val parameters = HoodieWriterUtils.parametersWithWriteDefaults(params) - val hoodieConfig = HoodieWriterUtils.convertMapToHoodieConfig(parameters) - - val buildSyncConfigMethod = - HoodieSparkSqlWriter.getClass.getDeclaredMethod("buildSyncConfig", classOf[Path], - classOf[HoodieConfig], classOf[SQLConf]) - buildSyncConfigMethod.setAccessible(true) - - val hiveSyncConfig = buildSyncConfigMethod.invoke(HoodieSparkSqlWriter, - new Path(tempBasePath), hoodieConfig, spark.sessionState.conf).asInstanceOf[HiveSyncConfig] - assertTrue(hiveSyncConfig.skipROSuffix) - assertTrue(hiveSyncConfig.createManagedTable) - assertTrue(hiveSyncConfig.syncAsSparkDataSourceTable) - assertResult(spark.sessionState.conf.getConf(StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD))(hiveSyncConfig.sparkSchemaLengthThreshold) - } - - /** - * Test case for build sync config for skip Ro Suffix values. 
- */ - @Test - def testBuildSyncConfigForSkipRoSuffixValues(): Unit = { - val params = Map( - "path" -> tempBasePath, - DataSourceWriteOptions.TABLE_NAME.key -> "test_hoodie", - DataSourceWriteOptions.HIVE_PARTITION_FIELDS.key -> "partition" - ) - val parameters = HoodieWriterUtils.parametersWithWriteDefaults(params) - val hoodieConfig = HoodieWriterUtils.convertMapToHoodieConfig(parameters) - val buildSyncConfigMethod = - HoodieSparkSqlWriter.getClass.getDeclaredMethod("buildSyncConfig", classOf[Path], - classOf[HoodieConfig], classOf[SQLConf]) - buildSyncConfigMethod.setAccessible(true) - val hiveSyncConfig = buildSyncConfigMethod.invoke(HoodieSparkSqlWriter, - new Path(tempBasePath), hoodieConfig, spark.sessionState.conf).asInstanceOf[HiveSyncConfig] - assertFalse(hiveSyncConfig.skipROSuffix) - } - /** * Test case for incremental view with replacement. */ diff --git a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncConfig.java b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncConfig.java index d4d580fe276af..7b146cfa453a1 100644 --- a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncConfig.java +++ b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncConfig.java @@ -19,7 +19,7 @@ package org.apache.hudi.dla; import org.apache.hudi.common.config.HoodieMetadataConfig; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor; import com.beust.jcommander.Parameter; diff --git a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java index bf0369ae2ee58..97838d03ed66b 100644 --- a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java +++ b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java @@ -23,6 +23,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; + +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.util.Option; @@ -41,7 +43,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.stream.Collectors; /** @@ -63,8 +64,8 @@ public class DLASyncTool extends AbstractSyncTool { private final String snapshotTableName; private final Option roTableTableName; - public DLASyncTool(Properties properties, FileSystem fs) { - super(properties, fs); + public DLASyncTool(TypedProperties properties, Configuration conf, FileSystem fs) { + super(properties, conf, fs); this.hoodieDLAClient = new HoodieDLAClient(Utils.propertiesToConfig(properties), fs); this.cfg = Utils.propertiesToConfig(properties); switch (hoodieDLAClient.getTableType()) { @@ -113,7 +114,7 @@ private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat) { LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieDLAClient.getBasePath() + " of type " + hoodieDLAClient.getTableType()); // Check if the necessary table exists - boolean tableExists = hoodieDLAClient.doesTableExist(tableName); + boolean tableExists = hoodieDLAClient.tableExists(tableName); // Get the parquet schema for this table looking at the latest commit MessageType schema = hoodieDLAClient.getDataSchema(); // Sync schema if 
needed @@ -205,7 +206,8 @@ public static void main(String[] args) { cmd.usage(); System.exit(1); } - FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration()); - new DLASyncTool(Utils.configToProperties(cfg), fs).syncHoodieTable(); + Configuration hadoopConf = new Configuration(); + FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf); + new DLASyncTool(Utils.configToProperties(cfg), hadoopConf, fs).syncHoodieTable(); } } diff --git a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java index 20f94f01ef0b3..78fea296a5410 100644 --- a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java +++ b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java @@ -27,7 +27,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HoodieHiveSyncException; -import org.apache.hudi.hive.PartitionValueExtractor; +import org.apache.hudi.sync.common.PartitionValueExtractor; import org.apache.hudi.hive.SchemaDifference; import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hudi.sync.common.AbstractSyncHoodieClient; @@ -115,7 +115,7 @@ public void createTable(String tableName, MessageType storageSchema, String inpu } public Map getTableSchema(String tableName) { - if (!doesTableExist(tableName)) { + if (!tableExists(tableName)) { throw new IllegalArgumentException( "Failed to get schema for table " + tableName + " does not exist"); } @@ -221,7 +221,7 @@ private void updateDLASQL(String sql) { } @Override - public boolean doesTableExist(String tableName) { + public boolean tableExists(String tableName) { String sql = consutructShowCreateTableSQL(tableName); Statement stmt = null; ResultSet rs = null; diff --git a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/util/Utils.java b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/util/Utils.java index ad47b71f84948..d1b0dd4e9d56f 100644 --- a/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/util/Utils.java +++ b/hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/util/Utils.java @@ -18,12 +18,12 @@ package org.apache.hudi.dla.util; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.dla.DLASyncConfig; import java.util.ArrayList; import java.util.Arrays; -import java.util.Properties; public class Utils { public static String DLA_DATABASE_OPT_KEY = "hoodie.datasource.dla_sync.database"; @@ -39,8 +39,8 @@ public class Utils { public static String DLA_SKIP_RT_SYNC = "hoodie.datasource.dla_sync.skip_rt_sync"; public static String DLA_SYNC_HIVE_STYLE_PARTITIONING = "hoodie.datasource.dla_sync.hive.style.partitioning"; - public static Properties configToProperties(DLASyncConfig cfg) { - Properties properties = new Properties(); + public static TypedProperties configToProperties(DLASyncConfig cfg) { + TypedProperties properties = new TypedProperties(); properties.put(DLA_DATABASE_OPT_KEY, cfg.databaseName); properties.put(DLA_TABLE_OPT_KEY, cfg.tableName); properties.put(DLA_USER_OPT_KEY, cfg.dlaUser); @@ -54,7 +54,7 @@ public static Properties configToProperties(DLASyncConfig cfg) { return properties; } - public static DLASyncConfig propertiesToConfig(Properties properties) { + public static DLASyncConfig propertiesToConfig(TypedProperties properties) { DLASyncConfig config = new DLASyncConfig(); config.databaseName = 
properties.getProperty(DLA_DATABASE_OPT_KEY); config.tableName = properties.getProperty(DLA_TABLE_OPT_KEY); diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/AbstractHiveSyncHoodieClient.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/AbstractHiveSyncHoodieClient.java new file mode 100644 index 0000000000000..b5214392e3f49 --- /dev/null +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/AbstractHiveSyncHoodieClient.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hive; + +import org.apache.hudi.sync.common.AbstractSyncHoodieClient; +import org.apache.hudi.sync.common.HoodieSyncException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.schema.MessageType; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * Base class to sync Hudi tables with Hive based metastores, such as Hive server, HMS or managed Hive services. + */ +public abstract class AbstractHiveSyncHoodieClient extends AbstractSyncHoodieClient { + + private static final Logger LOG = LogManager.getLogger(AbstractHiveSyncHoodieClient.class); + + public AbstractHiveSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata, boolean verifyMetadataFileListing, boolean withOperationField, + FileSystem fs) { + super(basePath, assumeDatePartitioning, useFileListingFromMetadata, verifyMetadataFileListing, withOperationField, fs); + } + + public AbstractHiveSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata, boolean withOperationField, FileSystem fs) { + super(basePath, assumeDatePartitioning, useFileListingFromMetadata, withOperationField, fs); + } + + public abstract void createDatabase(); + + /** + * @return true if the configured database exists + */ + public abstract boolean databaseExists(); + + public abstract void updateSchema(String tableName, MessageType newSchema); + + public abstract List getPartitionEvents(String tableName, List writtenPartitionsSince); + + /** + * Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the + * partition path does not match, it updates the partition path). 
+ */ + public boolean syncPartitions(String tableName, List writtenPartitionsSince) { + boolean partitionsChanged; + try { + List partitionEvents = + getPartitionEvents(tableName, writtenPartitionsSince); + List newPartitions = filterPartitions(partitionEvents, AbstractSyncHoodieClient.PartitionEvent.PartitionEventType.ADD); + LOG.info("New Partitions " + newPartitions); + addPartitionsToTable(tableName, newPartitions); + List updatePartitions = filterPartitions(partitionEvents, AbstractSyncHoodieClient.PartitionEvent.PartitionEventType.UPDATE); + LOG.info("Changed Partitions " + updatePartitions); + updatePartitionsToTable(tableName, updatePartitions); + partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty(); + } catch (Exception e) { + throw new HoodieSyncException("Failed to sync partitions for table " + tableName + + " in basepath " + getBasePath(), e); + } + + return partitionsChanged; + } + + private List filterPartitions(List events, AbstractSyncHoodieClient.PartitionEvent.PartitionEventType eventType) { + return events.stream().filter(s -> s.eventType == eventType).map(s -> s.storagePartition) + .collect(Collectors.toList()); + } +} + diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java index 9b6385120ff9f..f95528bbafa18 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java @@ -18,27 +18,17 @@ package org.apache.hudi.hive; +import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.sync.common.HoodieSyncConfig; import com.beust.jcommander.Parameter; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; - /** - * Configs needed to sync data into Hive. + * Configs specifically needed to sync data into Hive. 
*/ -public class HiveSyncConfig implements Serializable { - - @Parameter(names = {"--database"}, description = "name of the target database in Hive", required = true) - public String databaseName; - - @Parameter(names = {"--table"}, description = "name of the target table in Hive", required = true) - public String tableName; - - @Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)") - public String baseFileFormat = "PARQUET"; +public class HiveSyncConfig extends HoodieSyncConfig { @Parameter(names = {"--user"}, description = "Hive username") public String hiveUser; @@ -49,45 +39,31 @@ public class HiveSyncConfig implements Serializable { @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url") public String jdbcUrl; - @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true) - public String basePath; - - @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by") - public List partitionFields = new ArrayList<>(); - - @Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor " - + "to extract the partition values from HDFS path") - public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class.getName(); - - @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this" - + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter") - public Boolean assumeDatePartitioning = false; - @Parameter(names = {"--use-pre-apache-input-format"}, description = "Use InputFormat under com.uber.hoodie package " + "instead of org.apache.hudi package. Use this when you are in the process of migrating from " + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to " + "org.apache.hudi input format.") - public Boolean usePreApacheInputFormat = false; + public Boolean usePreApacheInputFormat; @Deprecated @Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url") - public Boolean useJdbc = true; + public Boolean useJdbc; @Parameter(names = {"--sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql") public String syncMode; @Parameter(names = {"--auto-create-database"}, description = "Auto create hive database") - public Boolean autoCreateDatabase = true; + public Boolean autoCreateDatabase; @Parameter(names = {"--ignore-exceptions"}, description = "Ignore hive exceptions") - public Boolean ignoreExceptions = false; + public Boolean ignoreExceptions; @Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering") - public Boolean skipROSuffix = false; + public Boolean skipROSuffix; @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata") - public Boolean useFileListingFromMetadata = HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; + public Boolean useFileListingFromMetadata; @Parameter(names = {"--table-properties"}, description = "Table properties to hive table") public String tableProperties; @@ -100,28 +76,146 @@ public class HiveSyncConfig implements Serializable { @Parameter(names = {"--support-timestamp"}, description = "'INT64' with original type TIMESTAMP_MICROS is converted to hive 'timestamp' type." 
+ "Disabled by default for backward compatibility.") - public Boolean supportTimestamp = false; - - @Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing") - public Boolean decodePartition = false; + public Boolean supportTimestamp; @Parameter(names = {"--managed-table"}, description = "Create a managed table") - public Boolean createManagedTable = false; + public Boolean createManagedTable; @Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive") - public Integer batchSyncNum = 1000; + public Integer batchSyncNum; @Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.") - public Boolean syncAsSparkDataSourceTable = true; + public Boolean syncAsSparkDataSourceTable; @Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.") - public int sparkSchemaLengthThreshold = 4000; + public int sparkSchemaLengthThreshold; @Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields") public Boolean withOperationField = false; - @Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.") - public Boolean isConditionalSync = false; + // HIVE SYNC SPECIFIC CONFIGS + // NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes + // unexpected issues with config getting reset + public static final ConfigProperty HIVE_SYNC_ENABLED = ConfigProperty + .key("hoodie.datasource.hive_sync.enable") + .defaultValue("false") + .withDocumentation("When set to true, register/sync the table to Apache Hive metastore."); + + public static final ConfigProperty HIVE_USER = ConfigProperty + .key("hoodie.datasource.hive_sync.username") + .defaultValue("hive") + .withDocumentation("hive user name to use"); + + public static final ConfigProperty HIVE_PASS = ConfigProperty + .key("hoodie.datasource.hive_sync.password") + .defaultValue("hive") + .withDocumentation("hive password to use"); + + public static final ConfigProperty HIVE_URL = ConfigProperty + .key("hoodie.datasource.hive_sync.jdbcurl") + .defaultValue("jdbc:hive2://localhost:10000") + .withDocumentation("Hive metastore url"); + + public static final ConfigProperty HIVE_USE_PRE_APACHE_INPUT_FORMAT = ConfigProperty + .key("hoodie.datasource.hive_sync.use_pre_apache_input_format") + .defaultValue("false") + .withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. " + + "Use this when you are in the process of migrating from " + + "com.uber.hoodie to org.apache.hudi. 
Stop using this after you migrated the table definition to org.apache.hudi input format"); + + /** + * @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0 + */ + @Deprecated + public static final ConfigProperty HIVE_USE_JDBC = ConfigProperty + .key("hoodie.datasource.hive_sync.use_jdbc") + .defaultValue("true") + .deprecatedAfter("0.9.0") + .withDocumentation("Use JDBC when hive synchronization is enabled"); + + public static final ConfigProperty HIVE_AUTO_CREATE_DATABASE = ConfigProperty + .key("hoodie.datasource.hive_sync.auto_create_database") + .defaultValue("true") + .withDocumentation("Auto create hive database if does not exists"); + + public static final ConfigProperty HIVE_IGNORE_EXCEPTIONS = ConfigProperty + .key("hoodie.datasource.hive_sync.ignore_exceptions") + .defaultValue("false") + .withDocumentation("Ignore exceptions when syncing with Hive."); + + public static final ConfigProperty HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE = ConfigProperty + .key("hoodie.datasource.hive_sync.skip_ro_suffix") + .defaultValue("false") + .withDocumentation("Skip the _ro suffix for Read optimized table, when registering"); + + public static final ConfigProperty HIVE_SUPPORT_TIMESTAMP_TYPE = ConfigProperty + .key("hoodie.datasource.hive_sync.support_timestamp") + .defaultValue("false") + .withDocumentation("‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. " + + "Disabled by default for backward compatibility."); + + public static final ConfigProperty HIVE_TABLE_PROPERTIES = ConfigProperty + .key("hoodie.datasource.hive_sync.table_properties") + .noDefaultValue() + .withDocumentation("Additional properties to store with table."); + + public static final ConfigProperty HIVE_TABLE_SERDE_PROPERTIES = ConfigProperty + .key("hoodie.datasource.hive_sync.serde_properties") + .noDefaultValue() + .withDocumentation("Serde properties to hive table."); + + public static final ConfigProperty HIVE_SYNC_AS_DATA_SOURCE_TABLE = ConfigProperty + .key("hoodie.datasource.hive_sync.sync_as_datasource") + .defaultValue("true") + .withDocumentation(""); + + public static final ConfigProperty HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = ConfigProperty + .key("hoodie.datasource.hive_sync.schema_string_length_thresh") + .defaultValue(4000) + .withDocumentation(""); + + // Create table as managed table + public static final ConfigProperty HIVE_CREATE_MANAGED_TABLE = ConfigProperty + .key("hoodie.datasource.hive_sync.create_managed_table") + .defaultValue(false) + .withDocumentation("Whether to sync the table as managed table."); + + public static final ConfigProperty HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty + .key("hoodie.datasource.hive_sync.batch_num") + .defaultValue(1000) + .withDocumentation("The number of partitions one batch when synchronous partitions to hive."); + + public static final ConfigProperty HIVE_SYNC_MODE = ConfigProperty + .key("hoodie.datasource.hive_sync.mode") + .noDefaultValue() + .withDocumentation("Mode to choose for Hive ops. 
Valid values are hms, jdbc and hiveql."); + + public HiveSyncConfig() { + this(new TypedProperties()); + } + + public HiveSyncConfig(TypedProperties props) { + super(props); + + this.hiveUser = getStringOrDefault(HIVE_USER); + this.hivePass = getStringOrDefault(HIVE_PASS); + this.jdbcUrl = getStringOrDefault(HIVE_URL); + this.usePreApacheInputFormat = getBooleanOrDefault(HIVE_USE_PRE_APACHE_INPUT_FORMAT); + this.useJdbc = getBooleanOrDefault(HIVE_USE_JDBC); + this.syncMode = getString(HIVE_SYNC_MODE); + this.autoCreateDatabase = getBooleanOrDefault(HIVE_AUTO_CREATE_DATABASE); + this.ignoreExceptions = getBooleanOrDefault(HIVE_IGNORE_EXCEPTIONS); + this.skipROSuffix = getBooleanOrDefault(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE); + this.useFileListingFromMetadata = getBooleanOrDefault(HoodieMetadataConfig.ENABLE); + this.tableProperties = getString(HIVE_TABLE_PROPERTIES); + this.serdeProperties = getString(HIVE_TABLE_SERDE_PROPERTIES); + this.supportTimestamp = getBooleanOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE); + this.batchSyncNum = getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM); + this.syncAsSparkDataSourceTable = getBooleanOrDefault(HIVE_SYNC_AS_DATA_SOURCE_TABLE); + this.sparkSchemaLengthThreshold = getIntOrDefault(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD); + this.createManagedTable = getBooleanOrDefault(HIVE_CREATE_MANAGED_TABLE); + + } // enhance the similar function in child class public static HiveSyncConfig copy(HiveSyncConfig cfg) { @@ -153,32 +247,32 @@ public static HiveSyncConfig copy(HiveSyncConfig cfg) { @Override public String toString() { return "HiveSyncConfig{" - + "databaseName='" + databaseName + '\'' - + ", tableName='" + tableName + '\'' - + ", baseFileFormat='" + baseFileFormat + '\'' - + ", hiveUser='" + hiveUser + '\'' - + ", hivePass='" + hivePass + '\'' - + ", jdbcUrl='" + jdbcUrl + '\'' - + ", basePath='" + basePath + '\'' - + ", partitionFields=" + partitionFields - + ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\'' - + ", assumeDatePartitioning=" + assumeDatePartitioning - + ", usePreApacheInputFormat=" + usePreApacheInputFormat - + ", useJdbc=" + useJdbc - + ", autoCreateDatabase=" + autoCreateDatabase - + ", ignoreExceptions=" + ignoreExceptions - + ", skipROSuffix=" + skipROSuffix - + ", useFileListingFromMetadata=" + useFileListingFromMetadata - + ", tableProperties='" + tableProperties + '\'' - + ", serdeProperties='" + serdeProperties + '\'' - + ", help=" + help - + ", supportTimestamp=" + supportTimestamp - + ", decodePartition=" + decodePartition - + ", createManagedTable=" + createManagedTable - + ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable - + ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold - + ", withOperationField=" + withOperationField - + ", isConditionalSync=" + isConditionalSync - + '}'; + + "databaseName='" + databaseName + '\'' + + ", tableName='" + tableName + '\'' + + ", baseFileFormat='" + baseFileFormat + '\'' + + ", hiveUser='" + hiveUser + '\'' + + ", hivePass='" + hivePass + '\'' + + ", jdbcUrl='" + jdbcUrl + '\'' + + ", basePath='" + basePath + '\'' + + ", partitionFields=" + partitionFields + + ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\'' + + ", assumeDatePartitioning=" + assumeDatePartitioning + + ", usePreApacheInputFormat=" + usePreApacheInputFormat + + ", useJdbc=" + useJdbc + + ", autoCreateDatabase=" + autoCreateDatabase + + ", ignoreExceptions=" + ignoreExceptions + + ", skipROSuffix=" + skipROSuffix + + ", useFileListingFromMetadata=" + 
useFileListingFromMetadata + + ", tableProperties='" + tableProperties + '\'' + + ", serdeProperties='" + serdeProperties + '\'' + + ", help=" + help + + ", supportTimestamp=" + supportTimestamp + + ", decodePartition=" + decodePartition + + ", createManagedTable=" + createManagedTable + + ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable + + ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold + + ", withOperationField=" + withOperationField + + ", isConditionalSync=" + isConditionalSync + + '}'; } } diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index 3bbaee1ed8bcb..2466b0101e4f7 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -18,6 +18,7 @@ package org.apache.hudi.hive; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieTableType; @@ -29,15 +30,13 @@ import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hudi.hive.util.Parquet2SparkSchemaUtils; -import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent; -import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType; import org.apache.hudi.sync.common.AbstractSyncTool; +import org.apache.hudi.sync.common.NonPartitionedExtractor; import com.beust.jcommander.JCommander; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.Partition; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.parquet.schema.GroupType; @@ -49,7 +48,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; import static org.apache.parquet.schema.OriginalType.UTF8; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; @@ -68,40 +66,39 @@ public class HiveSyncTool extends AbstractSyncTool { public static final String SUFFIX_SNAPSHOT_TABLE = "_rt"; public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro"; - protected final HiveSyncConfig cfg; - protected HoodieHiveClient hoodieHiveClient = null; + protected HiveSyncConfig hiveSyncConfig; + protected AbstractHiveSyncHoodieClient hoodieHiveClient; protected String snapshotTableName = null; protected Option roTableName = null; - public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { - super(configuration.getAllProperties(), fs); + public HiveSyncTool(TypedProperties props, Configuration conf, FileSystem fs) { + this(new HiveSyncConfig(props), new HiveConf(conf, HiveConf.class), fs); + } - try { - this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs); - } catch (RuntimeException e) { - if (cfg.ignoreExceptions) { - LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e); - } else { - throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e); - } - } + public HiveSyncTool(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf, FileSystem fs) { + this(instantiateHiveClient(hiveSyncConfig, hiveConf, fs), hiveSyncConfig, hiveConf, fs); + } + + protected HiveSyncTool(AbstractHiveSyncHoodieClient hoodieHiveClient, 
HiveSyncConfig hiveSyncConfig, HiveConf hiveConf, FileSystem fs) { + super(hiveSyncConfig.getProps(), hiveConf, fs); + this.hiveSyncConfig = hiveSyncConfig; + this.hoodieHiveClient = hoodieHiveClient; - this.cfg = cfg; // Set partitionFields to empty, when the NonPartitionedExtractor is used - if (NonPartitionedExtractor.class.getName().equals(cfg.partitionValueExtractorClass)) { + if (NonPartitionedExtractor.class.getName().equals(hiveSyncConfig.partitionValueExtractorClass)) { LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used"); - cfg.partitionFields = new ArrayList<>(); + hiveSyncConfig.partitionFields = new ArrayList<>(); } if (hoodieHiveClient != null) { switch (hoodieHiveClient.getTableType()) { case COPY_ON_WRITE: - this.snapshotTableName = cfg.tableName; + this.snapshotTableName = hiveSyncConfig.tableName; this.roTableName = Option.empty(); break; case MERGE_ON_READ: - this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE; - this.roTableName = cfg.skipROSuffix ? Option.of(cfg.tableName) : - Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE); + this.snapshotTableName = hiveSyncConfig.tableName + SUFFIX_SNAPSHOT_TABLE; + this.roTableName = hiveSyncConfig.skipROSuffix ? Option.of(hiveSyncConfig.tableName) : + Option.of(hiveSyncConfig.tableName + SUFFIX_READ_OPTIMIZED_TABLE); break; default: LOG.error("Unknown table type " + hoodieHiveClient.getTableType()); @@ -110,14 +107,30 @@ public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { } } + private static HoodieHiveClient instantiateHiveClient(HiveSyncConfig hiveSyncConfig, HiveConf hiveConf, FileSystem fs) { + HoodieHiveClient hoodieHiveClient = null; + try { + hoodieHiveClient = new HoodieHiveClient(hiveSyncConfig, hiveConf, fs); + } catch (RuntimeException e) { + if (hiveSyncConfig.ignoreExceptions) { + LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e); + } else { + throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e); + } + } + return hoodieHiveClient; + } + @Override public void syncHoodieTable() { try { if (hoodieHiveClient != null) { + LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). 
Hive metastore URL :" + + hiveSyncConfig.jdbcUrl + ", basePath :" + hiveSyncConfig.basePath); doSync(); } } catch (RuntimeException re) { - throw new HoodieException("Got runtime exception when hive syncing " + cfg.tableName, re); + throw new HoodieException("Got runtime exception when hive syncing " + hiveSyncConfig.tableName, re); } finally { if (hoodieHiveClient != null) { hoodieHiveClient.close(); @@ -143,28 +156,30 @@ protected void doSync() { } protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, - boolean readAsOptimized) { - LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieHiveClient.getBasePath() + boolean readAsOptimized) { + LOG.info("Trying to sync hoodie db " + hiveSyncConfig.databaseName + + " and table " + tableName + " with base path " + hoodieHiveClient.getBasePath() + " of type " + hoodieHiveClient.getTableType()); // check if the database exists else create it - if (cfg.autoCreateDatabase) { + if (hiveSyncConfig.autoCreateDatabase) { try { - if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) { - hoodieHiveClient.createDatabase(cfg.databaseName); + if (!hoodieHiveClient.databaseExists()) { + hoodieHiveClient.createDatabase(); } } catch (Exception e) { // this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing LOG.warn("Unable to create database", e); } } else { - if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) { - throw new HoodieHiveSyncException("hive database does not exist " + cfg.databaseName); + if (!hoodieHiveClient.databaseExists()) { + LOG.error("Hive database does not exist " + hiveSyncConfig.databaseName); + throw new HoodieHiveSyncException("hive database does not exist " + hiveSyncConfig.databaseName); } } // Check if the necessary table exists - boolean tableExists = hoodieHiveClient.doesTableExist(tableName); + boolean tableExists = hoodieHiveClient.tableExists(tableName); // Get the parquet schema for this table looking at the latest commit MessageType schema = hoodieHiveClient.getDataSchema(); @@ -174,9 +189,9 @@ protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, // by the data source way (which will use the HoodieBootstrapRelation). // TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071], we can remove this logical. if (hoodieHiveClient.isBootstrap() - && hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ - && !readAsOptimized) { - cfg.syncAsSparkDataSourceTable = false; + && hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ + && !readAsOptimized) { + hiveSyncConfig.syncAsSparkDataSourceTable = false; } // Sync schema if needed boolean schemaChanged = syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema); @@ -192,9 +207,9 @@ protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, LOG.info("Storage partitions scan complete. 
Found " + writtenPartitionsSince.size()); // Sync the partitions if needed - boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince); + boolean partitionsChanged = hoodieHiveClient.syncPartitions(tableName, writtenPartitionsSince); boolean meetSyncConditions = schemaChanged || partitionsChanged; - if (!cfg.isConditionalSync || meetSyncConditions) { + if (!hiveSyncConfig.isConditionalSync || meetSyncConditions) { hoodieHiveClient.updateLastCommitTimeSynced(tableName); } LOG.info("Sync complete for " + tableName); @@ -205,15 +220,15 @@ protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, * table schema. * * @param tableExists - does table exist - * @param schema - extracted schema + * @param schema - extracted schema */ private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, - boolean readAsOptimized, MessageType schema) { + boolean readAsOptimized, MessageType schema) { // Append spark table properties & serde properties - Map tableProperties = ConfigUtils.toMap(cfg.tableProperties); - Map serdeProperties = ConfigUtils.toMap(cfg.serdeProperties); - if (cfg.syncAsSparkDataSourceTable) { - Map sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema); + Map tableProperties = ConfigUtils.toMap(hiveSyncConfig.tableProperties); + Map serdeProperties = ConfigUtils.toMap(hiveSyncConfig.serdeProperties); + if (hiveSyncConfig.syncAsSparkDataSourceTable) { + Map sparkTableProperties = getSparkTableProperties(hiveSyncConfig.sparkSchemaLengthThreshold, schema); Map sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized); tableProperties.putAll(sparkTableProperties); serdeProperties.putAll(sparkSerdeProperties); @@ -222,10 +237,10 @@ private boolean syncSchema(String tableName, boolean tableExists, boolean useRea // Check and sync schema if (!tableExists) { LOG.info("Hive table " + tableName + " is not found. Creating it"); - HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase()); + HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(hiveSyncConfig.baseFileFormat.toUpperCase()); String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat); - if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) { + if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && hiveSyncConfig.usePreApacheInputFormat) { // Parquet input format had an InputFormat class visible under the old naming scheme. inputFormatClassName = useRealTimeInputFormat ? 
com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName() @@ -244,14 +259,14 @@ private boolean syncSchema(String tableName, boolean tableExists, boolean useRea } else { // Check if the table schema has evolved Map tableSchema = hoodieHiveClient.getTableSchema(tableName); - SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp); + SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, hiveSyncConfig.partitionFields, hiveSyncConfig.supportTimestamp); if (!schemaDiff.isEmpty()) { LOG.info("Schema difference found for " + tableName); - hoodieHiveClient.updateTableDefinition(tableName, schema); + hoodieHiveClient.updateSchema(tableName, schema); // Sync the table properties if the schema has changed - if (cfg.tableProperties != null) { + if (hiveSyncConfig.tableProperties != null) { hoodieHiveClient.updateTableProperties(tableName, tableProperties); - LOG.info("Sync table properties for " + tableName + ", table properties is: " + cfg.tableProperties); + LOG.info("Sync table properties for " + tableName + ", table properties is: " + hiveSyncConfig.tableProperties); } schemaChanged = true; } else { @@ -263,15 +278,16 @@ private boolean syncSchema(String tableName, boolean tableExists, boolean useRea /** * Get Spark Sql related table properties. This is used for spark datasource table. - * @param schema The schema to write to the table. + * + * @param schema The schema to write to the table. * @return A new parameters added the spark's table properties. */ - private Map getSparkTableProperties(int schemaLengthThreshold, MessageType schema) { + private Map getSparkTableProperties(int schemaLengthThreshold, MessageType schema) { // Convert the schema and partition info used by spark sql to hive table properties. // The following code refers to the spark code in // https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala GroupType originGroupType = schema.asGroupType(); - List partitionNames = cfg.partitionFields; + List partitionNames = hiveSyncConfig.partitionFields; List partitionCols = new ArrayList<>(); List dataCols = new ArrayList<>(); Map column2Field = new HashMap<>(); @@ -284,7 +300,7 @@ private Map getSparkTableProperties(int schemaLengthThreshold, M // Default the unknown partition fields to be String. // Keep the same logical with HiveSchemaUtil#getPartitionKeyType. partitionCols.add(column2Field.getOrDefault(partitionName, - new PrimitiveType(Type.Repetition.REQUIRED, BINARY, partitionName, UTF8))); + new PrimitiveType(Type.Repetition.REQUIRED, BINARY, partitionName, UTF8))); } for (Type field : originGroupType.getFields()) { @@ -322,39 +338,11 @@ private Map getSparkTableProperties(int schemaLengthThreshold, M private Map getSparkSerdeProperties(boolean readAsOptimized) { Map sparkSerdeProperties = new HashMap<>(); - sparkSerdeProperties.put("path", cfg.basePath); + sparkSerdeProperties.put("path", hiveSyncConfig.basePath); sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized)); return sparkSerdeProperties; } - /** - * Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the - * partition path does not match, it updates the partition path). 
- */ - private boolean syncPartitions(String tableName, List writtenPartitionsSince) { - boolean partitionsChanged; - try { - List hivePartitions = hoodieHiveClient.scanTablePartitions(tableName); - List partitionEvents = - hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince); - List newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD); - LOG.info("New Partitions " + newPartitions); - hoodieHiveClient.addPartitionsToTable(tableName, newPartitions); - List updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE); - LOG.info("Changed Partitions " + updatePartitions); - hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions); - partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty(); - } catch (Exception e) { - throw new HoodieHiveSyncException("Failed to sync partitions for table " + tableName, e); - } - return partitionsChanged; - } - - private List filterPartitions(List events, PartitionEventType eventType) { - return events.stream().filter(s -> s.eventType == eventType).map(s -> s.storagePartition) - .collect(Collectors.toList()); - } - public static void main(String[] args) { // parse the params final HiveSyncConfig cfg = new HiveSyncConfig(); diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java index 265ab750d5aee..d906477af9032 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java @@ -23,12 +23,14 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.hive.util.HiveSchemaUtil; -import org.apache.hudi.sync.common.AbstractSyncHoodieClient; import org.apache.hudi.hive.ddl.DDLExecutor; import org.apache.hudi.hive.ddl.HMSDDLExecutor; import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor; import org.apache.hudi.hive.ddl.HiveSyncMode; import org.apache.hudi.hive.ddl.JDBCExecutor; +import org.apache.hudi.sync.common.AbstractSyncHoodieClient; +import org.apache.hudi.sync.common.PartitionValueExtractor; + import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; @@ -49,7 +51,10 @@ import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP; -public class HoodieHiveClient extends AbstractSyncHoodieClient { +/** + * This class implements logic to sync a Hudi table with either the Hive server or the Hive Metastore. + */ +public class HoodieHiveClient extends AbstractHiveSyncHoodieClient { private static final String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync"; private static final String HIVE_ESCAPE_CHARACTER = HiveSchemaUtil.HIVE_ESCAPE_CHARACTER; @@ -142,45 +147,8 @@ public void updateTableProperties(String tableName, Map tablePro } } - /** - * Iterate over the storage partitions and find if there are any new partitions that need to be added or updated. - * Generate a list of PartitionEvent based on the changes required. 
- */ - List getPartitionEvents(List tablePartitions, List partitionStoragePartitions) { - Map paths = new HashMap<>(); - for (Partition tablePartition : tablePartitions) { - List hivePartitionValues = tablePartition.getValues(); - String fullTablePartitionPath = - Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getSd().getLocation())).toUri().getPath(); - paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath); - } - - List events = new ArrayList<>(); - for (String storagePartition : partitionStoragePartitions) { - Path storagePartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, storagePartition); - String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath(); - // Check if the partition values or if hdfs path is the same - List storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition); - if (!storagePartitionValues.isEmpty()) { - String storageValue = String.join(", ", storagePartitionValues); - if (!paths.containsKey(storageValue)) { - events.add(PartitionEvent.newPartitionAddEvent(storagePartition)); - } else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) { - events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition)); - } - } - } - return events; - } - - /** - * Scan table partitions. - */ - public List scanTablePartitions(String tableName) throws TException { - return client.listPartitions(syncConfig.databaseName, tableName, (short) -1); - } - - void updateTableDefinition(String tableName, MessageType newSchema) { + @Override + public void updateSchema(String tableName, MessageType newSchema) { ddlExecutor.updateTableDefinition(tableName, newSchema); } @@ -196,7 +164,7 @@ public void createTable(String tableName, MessageType storageSchema, String inpu */ @Override public Map getTableSchema(String tableName) { - if (!doesTableExist(tableName)) { + if (!tableExists(tableName)) { throw new IllegalArgumentException( "Failed to get schema for table " + tableName + " does not exist"); } @@ -207,7 +175,7 @@ public Map getTableSchema(String tableName) { * @return true if the configured table exists */ @Override - public boolean doesTableExist(String tableName) { + public boolean tableExists(String tableName) { try { return client.tableExists(syncConfig.databaseName, tableName); } catch (TException e) { @@ -216,23 +184,23 @@ public boolean doesTableExist(String tableName) { } /** - * @param databaseName * @return true if the configured database exists */ - public boolean doesDataBaseExist(String databaseName) { + public boolean databaseExists() { try { - client.getDatabase(databaseName); + client.getDatabase(syncConfig.databaseName); return true; } catch (NoSuchObjectException noSuchObjectException) { // NoSuchObjectException is thrown when there is no existing database of the name. 
return false; } catch (TException e) { - throw new HoodieHiveSyncException("Failed to check if database exists " + databaseName, e); + throw new HoodieHiveSyncException("Failed to check if database exists " + syncConfig.databaseName, e); } } - public void createDatabase(String databaseName) { - ddlExecutor.createDatabase(databaseName); + @Override + public void createDatabase() { + ddlExecutor.createDatabase(); } @Override @@ -291,6 +259,7 @@ public void deleteLastReplicatedTimeStamp(String tableName) { } } + @Override public void close() { try { ddlExecutor.close(); @@ -319,4 +288,44 @@ public void updateLastCommitTimeSynced(String tableName) { throw new HoodieHiveSyncException("Failed to get update last commit time synced to " + lastCommitSynced, e); } } + + @Override + public List getPartitionEvents(String tableName, List partitionStoragePartitions) { + try { + List tablePartitions = scanTablePartitions(tableName); + Map paths = new HashMap<>(); + for (Partition tablePartition : tablePartitions) { + List hivePartitionValues = tablePartition.getValues(); + String fullTablePartitionPath = + Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getSd().getLocation())).toUri().getPath(); + paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath); + } + + List events = new ArrayList<>(); + for (String storagePartition : partitionStoragePartitions) { + Path storagePartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, storagePartition); + String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath(); + // Check if the partition values or if hdfs path is the same + List storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition); + if (!storagePartitionValues.isEmpty()) { + String storageValue = String.join(", ", storagePartitionValues); + if (!paths.containsKey(storageValue)) { + events.add(PartitionEvent.newPartitionAddEvent(storagePartition)); + } else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) { + events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition)); + } + } + } + return events; + } catch (Exception e) { + throw new HoodieHiveSyncException("Failed to sync partitions for table " + tableName, e); + } + } + + /** + * Scan table partitions. + */ + public List scanTablePartitions(String tableName) throws TException { + return client.listPartitions(syncConfig.databaseName, tableName, (short) -1); + } } diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/DDLExecutor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/DDLExecutor.java index 0e1e223aab551..52d252c3261a4 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/DDLExecutor.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/DDLExecutor.java @@ -30,9 +30,9 @@ */ public interface DDLExecutor { /** - * @param databaseName name of database to be created. + * Create a database if it does not exist. */ - public void createDatabase(String databaseName); + void createDatabase(); /** * Creates a table with the following properties. 
@@ -45,7 +45,7 @@ public interface DDLExecutor { * @param serdeProperties * @param tableProperties */ - public void createTable(String tableName, MessageType storageSchema, String inputFormatClass, + void createTable(String tableName, MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass, Map serdeProperties, Map tableProperties); @@ -55,7 +55,7 @@ public void createTable(String tableName, MessageType storageSchema, String inpu * @param tableName * @param newSchema */ - public void updateTableDefinition(String tableName, MessageType newSchema); + void updateTableDefinition(String tableName, MessageType newSchema); /** * Fetches tableSchema for a table. @@ -63,7 +63,7 @@ public void createTable(String tableName, MessageType storageSchema, String inpu * @param tableName * @return */ - public Map getTableSchema(String tableName); + Map getTableSchema(String tableName); /** * Adds partition to table. @@ -71,7 +71,7 @@ public void createTable(String tableName, MessageType storageSchema, String inpu * @param tableName * @param partitionsToAdd */ - public void addPartitionsToTable(String tableName, List partitionsToAdd); + void addPartitionsToTable(String tableName, List partitionsToAdd); /** * Updates partitions for a given table. @@ -79,7 +79,7 @@ public void createTable(String tableName, MessageType storageSchema, String inpu * @param tableName * @param changedPartitions */ - public void updatePartitionsToTable(String tableName, List changedPartitions); + void updatePartitionsToTable(String tableName, List changedPartitions); - public void close(); + void close(); } diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java index 37aa54abd33b8..6988927b491c9 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java @@ -23,7 +23,7 @@ import org.apache.hudi.common.fs.StorageSchemes; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HoodieHiveSyncException; -import org.apache.hudi.hive.PartitionValueExtractor; +import org.apache.hudi.sync.common.PartitionValueExtractor; import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hadoop.fs.FileSystem; @@ -77,13 +77,13 @@ public HMSDDLExecutor(HiveConf conf, HiveSyncConfig syncConfig, FileSystem fs) t } @Override - public void createDatabase(String databaseName) { + public void createDatabase() { try { - Database database = new Database(databaseName, "automatically created by hoodie", null, null); + Database database = new Database(syncConfig.databaseName, "automatically created by hoodie", null, null); client.createDatabase(database); } catch (Exception e) { - LOG.error("Failed to create database " + databaseName, e); - throw new HoodieHiveSyncException("Failed to create database " + databaseName, e); + LOG.error("Failed to create database " + syncConfig.databaseName, e); + throw new HoodieHiveSyncException("Failed to create database " + syncConfig.databaseName, e); } } diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/QueryBasedDDLExecutor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/QueryBasedDDLExecutor.java index aed2bbedd45b1..7a522a0e033eb 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/QueryBasedDDLExecutor.java +++ 
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/QueryBasedDDLExecutor.java @@ -24,7 +24,7 @@ import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HoodieHiveSyncException; -import org.apache.hudi.hive.PartitionValueExtractor; +import org.apache.hudi.sync.common.PartitionValueExtractor; import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hadoop.fs.FileSystem; @@ -68,8 +68,8 @@ public QueryBasedDDLExecutor(HiveSyncConfig config, FileSystem fs) { public abstract void runSQL(String sql); @Override - public void createDatabase(String databaseName) { - runSQL("create database if not exists " + databaseName); + public void createDatabase() { + runSQL("create database if not exists " + config.databaseName); } @Override diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncTool.java index 1d225cb840c05..cf600c746ed4c 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncTool.java @@ -21,7 +21,9 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.hive.HiveSyncTool; +import org.apache.hudi.hive.HoodieHiveClient; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -35,9 +37,12 @@ public class GlobalHiveSyncTool extends HiveSyncTool { private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class); + private final HoodieHiveClient nativeHiveClient; public GlobalHiveSyncTool(GlobalHiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { super(cfg, configuration, fs); + ValidationUtils.checkArgument((hoodieHiveClient instanceof HoodieHiveClient), "GlobalHiveSyncTool only supports native Hive Sync of type " + HoodieHiveClient.class.getName()); + nativeHiveClient = (HoodieHiveClient) hoodieHiveClient; } @Override @@ -48,23 +53,23 @@ public void syncHoodieTable() { @Override protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean readAsOptimized) { super.syncHoodieTable(tableName, useRealtimeInputFormat, readAsOptimized); - if (((GlobalHiveSyncConfig)cfg).globallyReplicatedTimeStamp != null) { - hoodieHiveClient.updateLastReplicatedTimeStamp(tableName, - ((GlobalHiveSyncConfig) cfg).globallyReplicatedTimeStamp); + if (((GlobalHiveSyncConfig) hiveSyncConfig).globallyReplicatedTimeStamp != null) { + nativeHiveClient.updateLastReplicatedTimeStamp(tableName, + ((GlobalHiveSyncConfig) hiveSyncConfig).globallyReplicatedTimeStamp); } LOG.info("Sync complete for " + tableName); } public void close() { - hoodieHiveClient.close(); + nativeHiveClient.close(); } public Map> getLastReplicatedTimeStampMap() { Map> timeStampMap = new HashMap<>(); - Option timeStamp = hoodieHiveClient.getLastReplicatedTime(snapshotTableName); + Option timeStamp = nativeHiveClient.getLastReplicatedTime(snapshotTableName); timeStampMap.put(snapshotTableName, timeStamp); - if (HoodieTableType.MERGE_ON_READ.equals(hoodieHiveClient.getTableType())) { - Option roTimeStamp = hoodieHiveClient.getLastReplicatedTime(roTableName.get()); + if (HoodieTableType.MERGE_ON_READ.equals(nativeHiveClient.getTableType())) { + 
Option roTimeStamp = nativeHiveClient.getLastReplicatedTime(roTableName.get()); timeStampMap.put(roTableName.get(), roTimeStamp); } return timeStampMap; @@ -74,10 +79,10 @@ public void setLastReplicatedTimeStamp(Map> timeStampMap) for (String tableName : timeStampMap.keySet()) { Option timestamp = timeStampMap.get(tableName); if (timestamp.isPresent()) { - hoodieHiveClient.updateLastReplicatedTimeStamp(tableName, timestamp.get()); + nativeHiveClient.updateLastReplicatedTimeStamp(tableName, timestamp.get()); LOG.info("updated timestamp for " + tableName + " to: " + timestamp.get()); } else { - hoodieHiveClient.deleteLastReplicatedTimeStamp(tableName); + nativeHiveClient.deleteLastReplicatedTimeStamp(tableName); LOG.info("deleted timestamp for " + tableName); } } diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java index d36727a571deb..6fa1698c2f006 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java @@ -28,6 +28,8 @@ import org.apache.hudi.hive.util.ConfigUtils; import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent; import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; +import org.apache.hudi.sync.common.NonPartitionedExtractor; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; @@ -55,10 +57,12 @@ import static org.apache.hudi.hive.testutils.HiveTestUtil.ddlExecutor; import static org.apache.hudi.hive.testutils.HiveTestUtil.fileSystem; -import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncConfig; +import static org.apache.hudi.hive.testutils.HiveTestUtil.getHiveConf; +import static org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncProps; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -73,6 +77,7 @@ private static Iterable syncMode() { return SYNC_MODES; } + // useSchemaFromCommitMetadata, syncMode private static Iterable syncModeAndSchemaFromCommitMetadata() { List opts = new ArrayList<>(); for (Object mode : SYNC_MODES) { @@ -82,6 +87,9 @@ private static Iterable syncModeAndSchemaFromCommitMetadata() { return opts; } + private HiveSyncTool hiveSyncTool; + private HoodieHiveClient hiveClient; + @AfterAll public static void cleanUpClass() { HiveTestUtil.shutdown(); @@ -119,115 +127,99 @@ public void teardown() throws Exception { @ParameterizedTest @MethodSource({"syncModeAndSchemaFromCommitMetadata"}) public void testBasicSync(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 3; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); + String instantTime = "100"; HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata); - HoodieHiveClient hiveClient = - new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - assertFalse(hiveClient.doesTableExist(hiveSyncConfig.tableName), - "Table " + 
hiveSyncConfig.tableName + " should not exist initially"); + + reinitHiveSyncClient(); + assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - // we need renew the hiveclient after tool.syncHoodieTable(), because it will close hive - // session, then lead to connection retry, we can see there is a exception at log. - hiveClient = - new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - assertTrue(hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName), - "Table " + HiveTestUtil.hiveSyncConfig.tableName + " should exist after sync completes"); - assertEquals(hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).size(), + reSyncHiveTable(); + + assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); + assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), hiveClient.getDataSchema().getColumns().size() + 1, "Hive Schema should match the table schema + partition field"); - assertEquals(5, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote"); - assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES"); // Adding of new partitions List newPartition = Arrays.asList("2050/01/01"); - hiveClient.addPartitionsToTable(hiveSyncConfig.tableName, Arrays.asList()); - assertEquals(5, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, Arrays.asList()); + assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "No new partition should be added"); - hiveClient.addPartitionsToTable(hiveSyncConfig.tableName, newPartition); - assertEquals(6, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + hiveClient.addPartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition); + assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "New partition should be added"); // Update partitions - hiveClient.updatePartitionsToTable(hiveSyncConfig.tableName, Arrays.asList()); - assertEquals(6, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, Arrays.asList()); + assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Partition count should remain the same"); - hiveClient.updatePartitionsToTable(hiveSyncConfig.tableName, newPartition); - assertEquals(6, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + hiveClient.updatePartitionsToTable(HiveTestUtil.TABLE_NAME, newPartition); + assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Partition count should remain the same"); // Alter partitions // Manually change a hive partition location to check if the sync will detect // it and generate a partition update event for it. 
- ddlExecutor.runSQL("ALTER TABLE `" + hiveSyncConfig.tableName + ddlExecutor.runSQL("ALTER TABLE `" + HiveTestUtil.TABLE_NAME + "` PARTITION (`datestr`='2050-01-01') SET LOCATION '/some/new/location'"); - hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - List hivePartitions = hiveClient.scanTablePartitions(hiveSyncConfig.tableName); List writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.empty()); - //writtenPartitionsSince.add(newPartition.get(0)); - List partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince); + List partitionEvents = hiveClient.getPartitionEvents(HiveTestUtil.TABLE_NAME, writtenPartitionsSince); assertEquals(1, partitionEvents.size(), "There should be only one partition event"); assertEquals(PartitionEventType.UPDATE, partitionEvents.iterator().next().eventType, "The one partition event must of type UPDATE"); - tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - hiveClient = - new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); + // Lets do the sync + reSyncHiveTable(); + // Sync should update the changed partition to correct path - List tablePartitions = hiveClient.scanTablePartitions(hiveSyncConfig.tableName); + List tablePartitions = hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME); assertEquals(6, tablePartitions.size(), "The one partition we wrote should be added to hive"); - assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be 100"); } @ParameterizedTest @MethodSource({"syncMode"}) public void testSyncDataBase(String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 3; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); String instantTime = "100"; HiveTestUtil.createCOWTable(instantTime, 5, true); - hiveSyncConfig.databaseName = "database1"; + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), HiveTestUtil.DB_NAME); // while autoCreateDatabase is false and database not exists; - hiveSyncConfig.autoCreateDatabase = false; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "false"); + reinitHiveSyncClient(); // Lets do the sync - assertThrows(Exception.class, () -> { - new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem).syncHoodieTable(); - }); + assertThrows(Exception.class, (this::reSyncHiveTable)); // while autoCreateDatabase is true and database not exists; - hiveSyncConfig.autoCreateDatabase = true; - HoodieHiveClient hiveClient = - new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - assertDoesNotThrow(() -> { - new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem).syncHoodieTable(); - }); - assertTrue(hiveClient.doesDataBaseExist(hiveSyncConfig.databaseName), - "DataBases " + hiveSyncConfig.databaseName + " should exist after sync completes"); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "true"); + reinitHiveSyncClient(); + assertDoesNotThrow((this::reSyncHiveTable)); + assertTrue(hiveClient.databaseExists(), + "DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes"); // while autoCreateDatabase is false and database 
exists; - hiveSyncConfig.autoCreateDatabase = false; - assertDoesNotThrow(() -> { - new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem).syncHoodieTable(); - }); - assertTrue(hiveClient.doesDataBaseExist(hiveSyncConfig.databaseName), - "DataBases " + hiveSyncConfig.databaseName + " should exist after sync completes"); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "false"); + reinitHiveSyncClient(); + assertDoesNotThrow((this::reSyncHiveTable)); + assertTrue(hiveClient.databaseExists(), + "DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes"); // while autoCreateDatabase is true and database exists; - hiveSyncConfig.autoCreateDatabase = true; - assertDoesNotThrow(() -> { - new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem).syncHoodieTable(); - }); - assertTrue(hiveClient.doesDataBaseExist(hiveSyncConfig.databaseName), - "DataBases " + hiveSyncConfig.databaseName + " should exist after sync completes"); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE.key(), "true"); + assertDoesNotThrow((this::reSyncHiveTable)); + assertTrue(hiveClient.databaseExists(), + "DataBases " + HiveTestUtil.DB_NAME + " should exist after sync completes"); } @ParameterizedTest @@ -235,11 +227,9 @@ public void testSyncDataBase(String syncMode) throws Exception { public void testSyncCOWTableWithProperties(boolean useSchemaFromCommitMetadata, boolean syncAsDataSourceTable, String syncMode) throws Exception { - HiveSyncConfig hiveSyncConfig = HiveTestUtil.hiveSyncConfig; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 3; Map serdeProperties = new HashMap() { { - put("path", hiveSyncConfig.basePath); + put("path", HiveTestUtil.basePath); } }; @@ -249,20 +239,20 @@ public void testSyncCOWTableWithProperties(boolean useSchemaFromCommitMetadata, put("tp_1", "p1"); } }; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable)); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties)); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties)); - hiveSyncConfig.syncMode = syncMode; - hiveSyncConfig.syncAsSparkDataSourceTable = syncAsDataSourceTable; - hiveSyncConfig.serdeProperties = ConfigUtils.configToString(serdeProperties); - hiveSyncConfig.tableProperties = ConfigUtils.configToString(tableProperties); String instantTime = "100"; HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata); - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); + reinitHiveSyncClient(); + reSyncHiveTable(); SessionState.start(HiveTestUtil.getHiveConf()); Driver hiveDriver = new org.apache.hadoop.hive.ql.Driver(HiveTestUtil.getHiveConf()); - String dbTableName = hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName; + String dbTableName = HiveTestUtil.DB_NAME + "." 
+ HiveTestUtil.TABLE_NAME; hiveDriver.run("SHOW TBLPROPERTIES " + dbTableName); List results = new ArrayList<>(); hiveDriver.getResults(results); @@ -273,10 +263,10 @@ public void testSyncCOWTableWithProperties(boolean useSchemaFromCommitMetadata, String sparkTableProperties = getSparkTableProperties(syncAsDataSourceTable, useSchemaFromCommitMetadata); assertEquals( "EXTERNAL\tTRUE\n" - + "last_commit_time_sync\t100\n" - + sparkTableProperties - + "tp_0\tp0\n" - + "tp_1\tp1", tblPropertiesWithoutDdlTime); + + "last_commit_time_sync\t100\n" + + sparkTableProperties + + "tp_0\tp0\n" + + "tp_1\tp1", tblPropertiesWithoutDdlTime); assertTrue(results.get(results.size() - 1).startsWith("transient_lastDdlTime")); results.clear(); @@ -284,7 +274,7 @@ public void testSyncCOWTableWithProperties(boolean useSchemaFromCommitMetadata, hiveDriver.run("SHOW CREATE TABLE " + dbTableName); hiveDriver.getResults(results); String ddl = String.join("\n", results); - assertTrue(ddl.contains("'path'='" + hiveSyncConfig.basePath + "'")); + assertTrue(ddl.contains("'path'='" + HiveTestUtil.basePath + "'")); if (syncAsDataSourceTable) { assertTrue(ddl.contains("'" + ConfigUtils.IS_QUERY_AS_RO_TABLE + "'='false'")); } @@ -293,33 +283,33 @@ public void testSyncCOWTableWithProperties(boolean useSchemaFromCommitMetadata, private String getSparkTableProperties(boolean syncAsDataSourceTable, boolean useSchemaFromCommitMetadata) { if (syncAsDataSourceTable) { if (useSchemaFromCommitMetadata) { - return "spark.sql.sources.provider\thudi\n" - + "spark.sql.sources.schema.numPartCols\t1\n" - + "spark.sql.sources.schema.numParts\t1\n" - + "spark.sql.sources.schema.part.0\t{\"type\":\"struct\",\"fields\":" - + "[{\"name\":\"_hoodie_commit_time\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," - + "{\"name\":\"_hoodie_commit_seqno\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," - + "{\"name\":\"_hoodie_record_key\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," - + "{\"name\":\"_hoodie_partition_path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," - + "{\"name\":\"_hoodie_file_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," - + "{\"name\":\"name\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}," - + "{\"name\":\"favorite_number\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}}," - + "{\"name\":\"favorite_color\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}," - + "{\"name\":\"datestr\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}]}\n" - + "spark.sql.sources.schema.partCol.0\tdatestr\n"; + return "spark.sql.sources.provider\thudi\n" + + "spark.sql.sources.schema.numPartCols\t1\n" + + "spark.sql.sources.schema.numParts\t1\n" + + "spark.sql.sources.schema.part.0\t{\"type\":\"struct\",\"fields\":" + + "[{\"name\":\"_hoodie_commit_time\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," + + "{\"name\":\"_hoodie_commit_seqno\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," + + "{\"name\":\"_hoodie_record_key\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," + + "{\"name\":\"_hoodie_partition_path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," + + "{\"name\":\"_hoodie_file_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}," + + "{\"name\":\"name\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}," + + "{\"name\":\"favorite_number\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}}," + + "{\"name\":\"favorite_color\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}," + + 
"{\"name\":\"datestr\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}]}\n" + + "spark.sql.sources.schema.partCol.0\tdatestr\n"; } else { return "spark.sql.sources.provider\thudi\n" - + "spark.sql.sources.schema.numPartCols\t1\n" - + "spark.sql.sources.schema.numParts\t1\n" - + "spark.sql.sources.schema.part.0\t{\"type\":\"struct\",\"fields\":[{\"name\":\"name\",\"type\":" - + "\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"favorite_number\",\"type\":\"integer\"," - + "\"nullable\":false,\"metadata\":{}},{\"name\":\"favorite_color\",\"type\":\"string\",\"nullable\":false," - + "\"metadata\":{}}]}\n" - + "{\"name\":\"datestr\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}]}\n" - + "spark.sql.sources.schema.partCol.0\tdatestr\n"; + + "spark.sql.sources.schema.numPartCols\t1\n" + + "spark.sql.sources.schema.numParts\t1\n" + + "spark.sql.sources.schema.part.0\t{\"type\":\"struct\",\"fields\":[{\"name\":\"name\",\"type\":" + + "\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"favorite_number\",\"type\":\"integer\"," + + "\"nullable\":false,\"metadata\":{}},{\"name\":\"favorite_color\",\"type\":\"string\",\"nullable\":false," + + "\"metadata\":{}}]}\n" + + "{\"name\":\"datestr\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}]}\n" + + "spark.sql.sources.schema.partCol.0\tdatestr\n"; } } else { - return ""; + return ""; } } @@ -328,11 +318,9 @@ private String getSparkTableProperties(boolean syncAsDataSourceTable, boolean us public void testSyncMORTableWithProperties(boolean useSchemaFromCommitMetadata, boolean syncAsDataSourceTable, String syncMode) throws Exception { - HiveSyncConfig hiveSyncConfig = HiveTestUtil.hiveSyncConfig; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 3; Map serdeProperties = new HashMap() { { - put("path", hiveSyncConfig.basePath); + put("path", HiveTestUtil.basePath); } }; @@ -342,20 +330,21 @@ public void testSyncMORTableWithProperties(boolean useSchemaFromCommitMetadata, put("tp_1", "p1"); } }; - hiveSyncConfig.syncAsSparkDataSourceTable = syncAsDataSourceTable; - hiveSyncConfig.syncMode = syncMode; - hiveSyncConfig.serdeProperties = ConfigUtils.configToString(serdeProperties); - hiveSyncConfig.tableProperties = ConfigUtils.configToString(tableProperties); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), String.valueOf(syncAsDataSourceTable)); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES.key(), ConfigUtils.configToString(serdeProperties)); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_TABLE_PROPERTIES.key(), ConfigUtils.configToString(tableProperties)); + String instantTime = "100"; String deltaCommitTime = "101"; HiveTestUtil.createMORTable(instantTime, deltaCommitTime, 5, true, useSchemaFromCommitMetadata); - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - tool.syncHoodieTable(); + reinitHiveSyncClient(); + reSyncHiveTable(); - String roTableName = hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE; - String rtTableName = hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; + String roTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE; + String rtTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; String[] tableNames = new String[] {roTableName, rtTableName}; String[] readAsOptimizedResults = new String[] {"true", "false"}; @@ -364,8 +353,8 @@ public 
void testSyncMORTableWithProperties(boolean useSchemaFromCommitMetadata, Driver hiveDriver = new org.apache.hadoop.hive.ql.Driver(HiveTestUtil.getHiveConf()); String sparkTableProperties = getSparkTableProperties(syncAsDataSourceTable, useSchemaFromCommitMetadata); - for (int i = 0;i < 2; i++) { - String dbTableName = hiveSyncConfig.databaseName + "." + tableNames[i]; + for (int i = 0; i < 2; i++) { + String dbTableName = HiveTestUtil.DB_NAME + "." + tableNames[i]; String readAsOptimized = readAsOptimizedResults[i]; hiveDriver.run("SHOW TBLPROPERTIES " + dbTableName); @@ -376,10 +365,10 @@ public void testSyncMORTableWithProperties(boolean useSchemaFromCommitMetadata, results.subList(0, results.size() - 1)); assertEquals( "EXTERNAL\tTRUE\n" - + "last_commit_time_sync\t101\n" - + sparkTableProperties - + "tp_0\tp0\n" - + "tp_1\tp1", tblPropertiesWithoutDdlTime); + + "last_commit_time_sync\t101\n" + + sparkTableProperties + + "tp_0\tp0\n" + + "tp_1\tp1", tblPropertiesWithoutDdlTime); assertTrue(results.get(results.size() - 1).startsWith("transient_lastDdlTime")); results.clear(); @@ -387,7 +376,7 @@ public void testSyncMORTableWithProperties(boolean useSchemaFromCommitMetadata, hiveDriver.run("SHOW CREATE TABLE " + dbTableName); hiveDriver.getResults(results); String ddl = String.join("\n", results); - assertTrue(ddl.contains("'path'='" + hiveSyncConfig.basePath + "'")); + assertTrue(ddl.contains("'path'='" + HiveTestUtil.basePath + "'")); assertTrue(ddl.toLowerCase().contains("create external table")); if (syncAsDataSourceTable) { assertTrue(ddl.contains("'" + ConfigUtils.IS_QUERY_AS_RO_TABLE + "'='" + readAsOptimized + "'")); @@ -400,19 +389,18 @@ public void testSyncMORTableWithProperties(boolean useSchemaFromCommitMetadata, public void testSyncManagedTable(boolean useSchemaFromCommitMetadata, boolean isManagedTable, String syncMode) throws Exception { - HiveSyncConfig hiveSyncConfig = HiveTestUtil.hiveSyncConfig; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE.key(), String.valueOf(isManagedTable)); - hiveSyncConfig.syncMode = syncMode; - hiveSyncConfig.createManagedTable = isManagedTable; String instantTime = "100"; HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata); - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - tool.syncHoodieTable(); + reinitHiveSyncClient(); + reSyncHiveTable(); SessionState.start(HiveTestUtil.getHiveConf()); Driver hiveDriver = new org.apache.hadoop.hive.ql.Driver(HiveTestUtil.getHiveConf()); - String dbTableName = hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName; + String dbTableName = HiveTestUtil.DB_NAME + "." 
+ HiveTestUtil.TABLE_NAME; hiveDriver.run("SHOW TBLPROPERTIES " + dbTableName); List results = new ArrayList<>(); @@ -429,37 +417,29 @@ public void testSyncManagedTable(boolean useSchemaFromCommitMetadata, @ParameterizedTest @MethodSource("syncMode") public void testSyncWithSchema(String syncMode) throws Exception { - - hiveSyncConfig.syncMode = syncMode; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); String commitTime = "100"; HiveTestUtil.createCOWTableWithSchema(commitTime, "/complex.schema.avsc"); - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - HoodieHiveClient hiveClient = - new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - assertEquals(1, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + reinitHiveSyncClient(); + reSyncHiveTable(); + assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote"); - assertEquals(commitTime, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(commitTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES"); } @ParameterizedTest @MethodSource("syncMode") public void testSyncIncremental(String syncMode) throws Exception { - - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); String commitTime1 = "100"; HiveTestUtil.createCOWTable(commitTime1, 5, true); - HoodieHiveClient hiveClient = - new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - assertEquals(5, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + reinitHiveSyncClient(); + reSyncHiveTable(); + assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote"); - assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES"); // Now lets create more partitions and these are the only ones which needs to be synced @@ -468,37 +448,31 @@ public void testSyncIncremental(String syncMode) throws Exception { HiveTestUtil.addCOWPartitions(1, true, true, dateTime, commitTime2); // Lets do the sync - hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); + reSyncHiveTable(); List writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.of(commitTime1)); assertEquals(1, writtenPartitionsSince.size(), "We should have one partition written after 100 commit"); - List hivePartitions = hiveClient.scanTablePartitions(hiveSyncConfig.tableName); - List partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince); + List partitionEvents = hiveClient.getPartitionEvents(HiveTestUtil.TABLE_NAME, writtenPartitionsSince); assertEquals(1, partitionEvents.size(), "There should be only one partition event"); assertEquals(PartitionEventType.ADD, partitionEvents.iterator().next().eventType, "The one partition event must of type ADD"); - 
tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); // Sync should add the one partition - assertEquals(6, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + reSyncHiveTable(); + assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "The one partition we wrote should be added to hive"); - assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be 101"); } @ParameterizedTest @MethodSource("syncMode") public void testSyncIncrementalWithSchemaEvolution(String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); String commitTime1 = "100"; HiveTestUtil.createCOWTable(commitTime1, 5, true); - HoodieHiveClient hiveClient = - new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); + reinitHiveSyncClient(); + reSyncHiveTable(); - int fields = hiveClient.getTableSchema(hiveSyncConfig.tableName).size(); + int fields = hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(); // Now lets create more partitions and these are the only ones which needs to be synced ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6); @@ -506,51 +480,48 @@ public void testSyncIncrementalWithSchemaEvolution(String syncMode) throws Excep HiveTestUtil.addCOWPartitions(1, false, true, dateTime, commitTime2); // Lets do the sync - tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - - assertEquals(fields + 3, hiveClient.getTableSchema(hiveSyncConfig.tableName).size(), + reinitHiveSyncClient(); + reSyncHiveTable(); + assertEquals(fields + 3, hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), "Hive Schema has evolved and should not be 3 more field"); - assertEquals("BIGINT", hiveClient.getTableSchema(hiveSyncConfig.tableName).get("favorite_number"), + assertEquals("BIGINT", hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get("favorite_number"), "Hive Schema has evolved - Field favorite_number has evolved from int to long"); - assertTrue(hiveClient.getTableSchema(hiveSyncConfig.tableName).containsKey("favorite_movie"), + assertTrue(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).containsKey("favorite_movie"), "Hive Schema has evolved - Field favorite_movie was added"); // Sync should add the one partition - assertEquals(6, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "The one partition we wrote should be added to hive"); - assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be 101"); } @ParameterizedTest @MethodSource("syncModeAndSchemaFromCommitMetadata") public void testSyncMergeOnRead(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); 
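The reinitHiveSyncClient() and reSyncHiveTable() helpers that these test hunks call are not defined in this part of the patch. A minimal sketch of what they are assumed to do, reusing the hiveSyncTool/hiveClient fields and the hiveSyncProps/getHiveConf()/fileSystem imports added above (the bodies are assumptions inferred from the call sites, not shown by the patch here):

  private void reinitHiveSyncClient() {
    hiveSyncTool = new HiveSyncTool(hiveSyncProps, getHiveConf(), fileSystem);
    hiveClient = (HoodieHiveClient) hiveSyncTool.hoodieHiveClient; // assumes the tool's client field is reachable from the test
  }

  private void reSyncHiveTable() {
    hiveSyncTool.syncHoodieTable();
    // syncHoodieTable() closes the Hive session, so renew the client before further assertions,
    // matching the removed comment above about renewing the hive client after each sync
    reinitHiveSyncClient();
  }

Under this reading, the connection-failure test further below also fits: when the sync URL points at a dead port and exceptions are ignored, the renewed hiveClient can end up null, which is what the assertNull(hiveClient) check there relies on.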
String instantTime = "100"; String deltaCommitTime = "101"; HiveTestUtil.createMORTable(instantTime, deltaCommitTime, 5, true, useSchemaFromCommitMetadata); - String roTableName = hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE; - HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - assertFalse(hiveClient.doesTableExist(roTableName), "Table " + hiveSyncConfig.tableName + " should not exist initially"); + String roTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE; + reinitHiveSyncClient(); + assertFalse(hiveClient.tableExists(roTableName), "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); + reSyncHiveTable(); - assertTrue(hiveClient.doesTableExist(roTableName), "Table " + roTableName + " should exist after sync completes"); + assertTrue(hiveClient.tableExists(roTableName), "Table " + roTableName + " should exist after sync completes"); if (useSchemaFromCommitMetadata) { assertEquals(hiveClient.getTableSchema(roTableName).size(), - SchemaTestUtil.getSimpleSchema().getFields().size() + hiveSyncConfig.partitionFields.size() + SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize() + HoodieRecord.HOODIE_META_COLUMNS.size(), "Hive Schema should match the table schema + partition field"); } else { // The data generated and schema in the data file do not have metadata columns, so we need a separate check. assertEquals(hiveClient.getTableSchema(roTableName).size(), - SchemaTestUtil.getSimpleSchema().getFields().size() + hiveSyncConfig.partitionFields.size(), + SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize(), "Hive Schema should match the table schema + partition field"); } @@ -568,19 +539,18 @@ public void testSyncMergeOnRead(boolean useSchemaFromCommitMetadata, String sync HiveTestUtil.addMORPartitions(1, true, false, useSchemaFromCommitMetadata, dateTime, commitTime2, deltaCommitTime2); // Lets do the sync - tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); + reinitHiveSyncClient(); + reSyncHiveTable(); if (useSchemaFromCommitMetadata) { assertEquals(hiveClient.getTableSchema(roTableName).size(), - SchemaTestUtil.getEvolvedSchema().getFields().size() + hiveSyncConfig.partitionFields.size() + SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize() + HoodieRecord.HOODIE_META_COLUMNS.size(), "Hive Schema should match the evolved table schema + partition field"); } else { // The data generated and schema in the data file do not have metadata columns, so we need a separate check. 
assertEquals(hiveClient.getTableSchema(roTableName).size(), - SchemaTestUtil.getEvolvedSchema().getFields().size() + hiveSyncConfig.partitionFields.size(), + SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize(), "Hive Schema should match the evolved table schema + partition field"); } // Sync should add the one partition @@ -593,42 +563,38 @@ public void testSyncMergeOnRead(boolean useSchemaFromCommitMetadata, String sync @ParameterizedTest @MethodSource("syncModeAndSchemaFromCommitMetadata") public void testSyncMergeOnReadRT(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); String instantTime = "100"; String deltaCommitTime = "101"; - String snapshotTableName = hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; + String snapshotTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; HiveTestUtil.createMORTable(instantTime, deltaCommitTime, 5, true, useSchemaFromCommitMetadata); - HoodieHiveClient hiveClientRT = - new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - - assertFalse(hiveClientRT.doesTableExist(snapshotTableName), - "Table " + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE + reinitHiveSyncClient(); + assertFalse(hiveClient.tableExists(snapshotTableName), + "Table " + HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE + " should not exist initially"); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); + reSyncHiveTable(); - assertTrue(hiveClientRT.doesTableExist(snapshotTableName), - "Table " + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE + assertTrue(hiveClient.tableExists(snapshotTableName), + "Table " + HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE + " should exist after sync completes"); if (useSchemaFromCommitMetadata) { - assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(), - SchemaTestUtil.getSimpleSchema().getFields().size() + hiveSyncConfig.partitionFields.size() + assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), + SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize() + HoodieRecord.HOODIE_META_COLUMNS.size(), "Hive Schema should match the table schema + partition field"); } else { // The data generated and schema in the data file do not have metadata columns, so we need a separate check. 
- assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(), - SchemaTestUtil.getSimpleSchema().getFields().size() + hiveSyncConfig.partitionFields.size(), + assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), + SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize(), "Hive Schema should match the table schema + partition field"); } - assertEquals(5, hiveClientRT.scanTablePartitions(snapshotTableName).size(), + assertEquals(5, hiveClient.scanTablePartitions(snapshotTableName).size(), "Table partitions should match the number of partitions we wrote"); - assertEquals(deltaCommitTime, hiveClientRT.getLastCommitTimeSynced(snapshotTableName).get(), + assertEquals(deltaCommitTime, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(), "The last commit that was synced should be updated in the TBLPROPERTIES"); // Now lets create more partitions and these are the only ones which needs to be synced @@ -639,57 +605,52 @@ public void testSyncMergeOnReadRT(boolean useSchemaFromCommitMetadata, String sy HiveTestUtil.addCOWPartitions(1, true, useSchemaFromCommitMetadata, dateTime, commitTime2); HiveTestUtil.addMORPartitions(1, true, false, useSchemaFromCommitMetadata, dateTime, commitTime2, deltaCommitTime2); // Lets do the sync - tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - hiveClientRT = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); + reinitHiveSyncClient(); + reSyncHiveTable(); if (useSchemaFromCommitMetadata) { - assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(), - SchemaTestUtil.getEvolvedSchema().getFields().size() + hiveSyncConfig.partitionFields.size() + assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), + SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize() + HoodieRecord.HOODIE_META_COLUMNS.size(), "Hive Schema should match the evolved table schema + partition field"); } else { // The data generated and schema in the data file do not have metadata columns, so we need a separate check. 
- assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(), - SchemaTestUtil.getEvolvedSchema().getFields().size() + hiveSyncConfig.partitionFields.size(), + assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), + SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize(), "Hive Schema should match the evolved table schema + partition field"); } // Sync should add the one partition - assertEquals(6, hiveClientRT.scanTablePartitions(snapshotTableName).size(), + assertEquals(6, hiveClient.scanTablePartitions(snapshotTableName).size(), "The 2 partitions we wrote should be added to hive"); - assertEquals(deltaCommitTime2, hiveClientRT.getLastCommitTimeSynced(snapshotTableName).get(), + assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(), "The last commit that was synced should be 103"); } @ParameterizedTest @MethodSource("syncMode") public void testMultiPartitionKeySync(String syncMode) throws Exception { - - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); String instantTime = "100"; HiveTestUtil.createCOWTable(instantTime, 5, true); - HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(HiveTestUtil.hiveSyncConfig); - hiveSyncConfig.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName(); - hiveSyncConfig.tableName = "multi_part_key"; - hiveSyncConfig.partitionFields = Arrays.asList("year", "month", "day"); - HiveTestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName()); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day"); - HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - assertFalse(hiveClient.doesTableExist(hiveSyncConfig.tableName), - "Table " + hiveSyncConfig.tableName + " should not exist initially"); + HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." 
+ HiveTestUtil.TABLE_NAME); + + reinitHiveSyncClient(); + assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName), - "Table " + hiveSyncConfig.tableName + " should exist after sync completes"); - assertEquals(hiveClient.getTableSchema(hiveSyncConfig.tableName).size(), + reSyncHiveTable(); + assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); + assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), hiveClient.getDataSchema().getColumns().size() + 3, "Hive Schema should match the table schema + partition fields"); - assertEquals(5, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote"); - assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES"); // HoodieHiveClient had a bug where partition vals were sorted @@ -698,41 +659,35 @@ public void testMultiPartitionKeySync(String syncMode) throws Exception { String commitTime2 = "101"; HiveTestUtil.addCOWPartition("2010/01/02", true, true, commitTime2); - hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); + reinitHiveSyncClient(); List writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.of(instantTime)); assertEquals(1, writtenPartitionsSince.size(), "We should have one partition written after 100 commit"); - List hivePartitions = hiveClient.scanTablePartitions(hiveSyncConfig.tableName); - List partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince); + List partitionEvents = hiveClient.getPartitionEvents(HiveTestUtil.TABLE_NAME, writtenPartitionsSince); assertEquals(1, partitionEvents.size(), "There should be only one partition event"); assertEquals(PartitionEventType.ADD, partitionEvents.iterator().next().eventType, "The one partition event must of type ADD"); - tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - + reSyncHiveTable(); // Sync should add the one partition - assertEquals(6, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote"); - assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be 101"); // create partition "2010/02/01" and ensure sync works String commitTime3 = "102"; HiveTestUtil.addCOWPartition("2010/02/01", true, true, commitTime3); - HiveTestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); - - hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); + HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." 
+ HiveTestUtil.TABLE_NAME); - tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - - assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName), - "Table " + hiveSyncConfig.tableName + " should exist after sync completes"); - assertEquals(hiveClient.getTableSchema(hiveSyncConfig.tableName).size(), + reinitHiveSyncClient(); + reSyncHiveTable(); + assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); + assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), hiveClient.getDataSchema().getColumns().size() + 3, "Hive Schema should match the table schema + partition fields"); - assertEquals(7, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + assertEquals(7, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote"); - assertEquals(commitTime3, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), + assertEquals(commitTime3, hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES"); assertEquals(1, hiveClient.getPartitionsWrittenToSince(Option.of(commitTime2)).size()); } @@ -740,62 +695,53 @@ public void testMultiPartitionKeySync(String syncMode) throws Exception { @ParameterizedTest @MethodSource("syncMode") public void testNonPartitionedSync(String syncMode) throws Exception { - - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); String instantTime = "100"; HiveTestUtil.createCOWTable(instantTime, 5, true); - - HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(HiveTestUtil.hiveSyncConfig); // Set partition value extractor to NonPartitionedExtractor - hiveSyncConfig.partitionValueExtractorClass = NonPartitionedExtractor.class.getCanonicalName(); - hiveSyncConfig.tableName = "non_partitioned"; - hiveSyncConfig.partitionFields = Arrays.asList("year", "month", "day"); - HiveTestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); - - HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - assertFalse(hiveClient.doesTableExist(hiveSyncConfig.tableName), - "Table " + hiveSyncConfig.tableName + " should not exist initially"); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName()); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year, month, day"); + + HiveTestUtil.getCreatedTablesSet().add(HiveTestUtil.DB_NAME + "." 
+ HiveTestUtil.TABLE_NAME); + + reinitHiveSyncClient(); + assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName), - "Table " + hiveSyncConfig.tableName + " should exist after sync completes"); - assertEquals(hiveClient.getTableSchema(hiveSyncConfig.tableName).size(), + reSyncHiveTable(); + assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); + assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), hiveClient.getDataSchema().getColumns().size(), "Hive Schema should match the table schema,ignoring the partition fields"); - assertEquals(0, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), + assertEquals(0, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table should not have partitions because of the NonPartitionedExtractor"); } @ParameterizedTest @MethodSource("syncMode") public void testReadSchemaForMOR(String syncMode) throws Exception { - - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); String commitTime = "100"; - String snapshotTableName = hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; + String snapshotTableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; HiveTestUtil.createMORTable(commitTime, "", 5, false, true); - HoodieHiveClient hiveClientRT = - new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); + reinitHiveSyncClient(); - assertFalse(hiveClientRT.doesTableExist(snapshotTableName), "Table " + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE + assertFalse(hiveClient.tableExists(snapshotTableName), "Table " + HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE + " should not exist initially"); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); + reSyncHiveTable(); - assertTrue(hiveClientRT.doesTableExist(snapshotTableName), "Table " + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE + assertTrue(hiveClient.tableExists(snapshotTableName), "Table " + HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE + " should exist after sync completes"); // Schema being read from compacted base files - assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(), - SchemaTestUtil.getSimpleSchema().getFields().size() + hiveSyncConfig.partitionFields.size() + assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), + SchemaTestUtil.getSimpleSchema().getFields().size() + getPartitionFieldSize() + HoodieRecord.HOODIE_META_COLUMNS.size(), "Hive Schema should match the table schema + partition field"); - assertEquals(5, hiveClientRT.scanTablePartitions(snapshotTableName).size(), "Table partitions should match the number of partitions we wrote"); + assertEquals(5, hiveClient.scanTablePartitions(snapshotTableName).size(), "Table partitions should match the number of partitions we wrote"); // Now lets create more partitions and these are the only ones which needs to be synced ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6); @@ -804,84 +750,78 @@ public void 
testReadSchemaForMOR(String syncMode) throws Exception { HiveTestUtil.addMORPartitions(1, true, false, true, dateTime, commitTime2, deltaCommitTime2); // Lets do the sync - tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - tool.syncHoodieTable(); - hiveClientRT = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); + reinitHiveSyncClient(); + reSyncHiveTable(); // Schema being read from the log files - assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(), - SchemaTestUtil.getEvolvedSchema().getFields().size() + hiveSyncConfig.partitionFields.size() + assertEquals(hiveClient.getTableSchema(snapshotTableName).size(), + SchemaTestUtil.getEvolvedSchema().getFields().size() + getPartitionFieldSize() + HoodieRecord.HOODIE_META_COLUMNS.size(), "Hive Schema should match the evolved table schema + partition field"); // Sync should add the one partition - assertEquals(6, hiveClientRT.scanTablePartitions(snapshotTableName).size(), "The 1 partition we wrote should be added to hive"); - assertEquals(deltaCommitTime2, hiveClientRT.getLastCommitTimeSynced(snapshotTableName).get(), + assertEquals(6, hiveClient.scanTablePartitions(snapshotTableName).size(), "The 1 partition we wrote should be added to hive"); + assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(snapshotTableName).get(), "The last commit that was synced should be 103"); } @Test public void testConnectExceptionIgnoreConfigSet() throws IOException, URISyntaxException, HiveException, MetaException { - hiveSyncConfig.useJdbc = true; - HiveTestUtil.hiveSyncConfig.useJdbc = true; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; String instantTime = "100"; HiveTestUtil.createCOWTable(instantTime, 5, false); - HoodieHiveClient hiveClient = - new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem); - assertFalse(hiveClient.doesTableExist(hiveSyncConfig.tableName), - "Table " + hiveSyncConfig.tableName + " should not exist initially"); - // Lets do the sync - - HiveSyncConfig syncToolConfig = HiveSyncConfig.copy(hiveSyncConfig); - syncToolConfig.ignoreExceptions = true; - syncToolConfig.jdbcUrl = HiveTestUtil.hiveSyncConfig.jdbcUrl - .replace(String.valueOf(HiveTestUtil.hiveTestService.getHiveServerPort()), String.valueOf(NetworkTestUtils.nextFreePort())); - HiveSyncTool tool = new HiveSyncTool(syncToolConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - tool.syncHoodieTable(); + reinitHiveSyncClient(); + HoodieHiveClient prevHiveClient = hiveClient; + assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); - assertFalse(hiveClient.doesTableExist(hiveSyncConfig.tableName), - "Table " + hiveSyncConfig.tableName + " should not exist initially"); + // Lets do the sync + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS.key(), "true"); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveSyncProps.getString(HiveSyncConfig.HIVE_URL.key()) + .replace(String.valueOf(HiveTestUtil.hiveTestService.getHiveServerPort()), String.valueOf(NetworkTestUtils.nextFreePort()))); + reinitHiveSyncClient(); + reSyncHiveTable(); + + assertNull(hiveClient); + assertFalse(prevHiveClient.tableExists(HiveTestUtil.TABLE_NAME), + "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); } private void verifyOldParquetFileTest(HoodieHiveClient hiveClient, String emptyCommitTime) throws Exception { - 
assertTrue(hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName), "Table " + HiveTestUtil.hiveSyncConfig.tableName + " should exist after sync completes"); - assertEquals(hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).size(), + assertTrue(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should exist after sync completes"); + assertEquals(hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), hiveClient.getDataSchema().getColumns().size() + 1, "Hive Schema should match the table schema + partition field"); - assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),"Table partitions should match the number of partitions we wrote"); + assertEquals(1, hiveClient.scanTablePartitions(HiveTestUtil.TABLE_NAME).size(), "Table partitions should match the number of partitions we wrote"); assertEquals(emptyCommitTime, - hiveClient.getLastCommitTimeSynced(HiveTestUtil.hiveSyncConfig.tableName).get(),"The last commit that was synced should be updated in the TBLPROPERTIES"); + hiveClient.getLastCommitTimeSynced(HiveTestUtil.TABLE_NAME).get(), "The last commit that was synced should be updated in the TBLPROPERTIES"); // make sure correct schema is picked Schema schema = SchemaTestUtil.getSimpleSchema(); for (Field field : schema.getFields()) { assertEquals(field.schema().getType().getName(), - hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).get(field.name()).toLowerCase(), + hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get(field.name()).toLowerCase(), String.format("Hive Schema Field %s was added", field)); } assertEquals("string", - hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).get("datestr").toLowerCase(), "Hive Schema Field datestr was added"); + hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).get("datestr").toLowerCase(), "Hive Schema Field datestr was added"); assertEquals(schema.getFields().size() + 1 + HoodieRecord.HOODIE_META_COLUMNS.size(), - hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).size(),"Hive Schema fields size"); + hiveClient.getTableSchema(HiveTestUtil.TABLE_NAME).size(), "Hive Schema fields size"); } @ParameterizedTest @MethodSource("syncMode") public void testPickingOlderParquetFileIfLatestIsEmptyCommit(String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); final String commitTime = "100"; HiveTestUtil.createCOWTable(commitTime, 1, true); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); // create empty commit final String emptyCommitTime = "200"; HiveTestUtil.createCommitFileWithSchema(commitMetadata, emptyCommitTime, true); - HoodieHiveClient hiveClient = - new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - assertFalse(hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName),"Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist initially"); + reinitHiveSyncClient(); + assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); - HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - tool.syncHoodieTable(); + reinitHiveSyncClient(); + reSyncHiveTable(); verifyOldParquetFileTest(hiveClient, emptyCommitTime); } @@ -889,8 +829,7 @@ public void 
testPickingOlderParquetFileIfLatestIsEmptyCommit(String syncMode) th @ParameterizedTest @MethodSource("syncMode") public void testNotPickingOlderParquetFileWhenLatestCommitReadFails(String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); final String commitTime = "100"; HiveTestUtil.createCOWTable(commitTime, 1, true); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); @@ -904,15 +843,13 @@ public void testNotPickingOlderParquetFileWhenLatestCommitReadFails(String syncM final String emptyCommitTime = "200"; HiveTestUtil.createCommitFile(commitMetadata, emptyCommitTime); - HoodieHiveClient hiveClient = - new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); + reinitHiveSyncClient(); assertFalse( - hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName),"Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist initially"); - - HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); + hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); + HiveSyncTool tool = new HiveSyncTool(hiveSyncProps, getHiveConf(), fileSystem); // now delete the evolved commit instant - Path fullPath = new Path(HiveTestUtil.hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + Path fullPath = new Path(HiveTestUtil.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + hiveClient.getActiveTimeline().getInstants() .filter(inst -> inst.getTimestamp().equals(commitTime2)) .findFirst().get().getFileName()); @@ -925,15 +862,13 @@ public void testNotPickingOlderParquetFileWhenLatestCommitReadFails(String syncM } // table should not be synced yet - assertFalse( - hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName),"Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist at all"); + assertFalse(hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should not exist at all"); } @ParameterizedTest @MethodSource("syncMode") public void testNotPickingOlderParquetFileWhenLatestCommitReadFailsForExistingTable(String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); final String commitTime = "100"; HiveTestUtil.createCOWTable(commitTime, 1, true); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); @@ -941,13 +876,11 @@ public void testNotPickingOlderParquetFileWhenLatestCommitReadFailsForExistingTa final String emptyCommitTime = "200"; HiveTestUtil.createCommitFileWithSchema(commitMetadata, emptyCommitTime, true); //HiveTestUtil.createCommitFile(commitMetadata, emptyCommitTime); - HoodieHiveClient hiveClient = - new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); + reinitHiveSyncClient(); assertFalse( - hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName), "Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist initially"); + hiveClient.tableExists(HiveTestUtil.TABLE_NAME), "Table " + HiveTestUtil.TABLE_NAME + " should not exist initially"); - HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - 
tool.syncHoodieTable(); + reSyncHiveTable(); verifyOldParquetFileTest(hiveClient, emptyCommitTime); @@ -958,18 +891,19 @@ public void testNotPickingOlderParquetFileWhenLatestCommitReadFailsForExistingTa //HiveTestUtil.createCommitFileWithSchema(commitMetadata, "400", false); // create another empty commit //HiveTestUtil.createCommitFile(commitMetadata, "400"); // create another empty commit - tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - HoodieHiveClient hiveClientLatest = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); + reinitHiveSyncClient(); // now delete the evolved commit instant - Path fullPath = new Path(HiveTestUtil.hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" - + hiveClientLatest.getActiveTimeline().getInstants() - .filter(inst -> inst.getTimestamp().equals(commitTime2)) - .findFirst().get().getFileName()); + Path fullPath = new Path(HiveTestUtil.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + hiveClient.getActiveTimeline().getInstants() + .filter(inst -> inst.getTimestamp().equals(commitTime2)) + .findFirst().get().getFileName()); assertTrue(HiveTestUtil.fileSystem.delete(fullPath, false)); try { - tool.syncHoodieTable(); + reSyncHiveTable(); } catch (RuntimeException e) { // we expect the table sync to fail + } finally { + reinitHiveSyncClient(); } // old sync values should be left intact @@ -979,15 +913,13 @@ public void testNotPickingOlderParquetFileWhenLatestCommitReadFailsForExistingTa @ParameterizedTest @MethodSource("syncMode") public void testTypeConverter(String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); HiveTestUtil.createCOWTable("100", 5, true); // create database. 
- ddlExecutor.runSQL("create database " + hiveSyncConfig.databaseName); - HoodieHiveClient hiveClient = - new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - String tableName = HiveTestUtil.hiveSyncConfig.tableName; - String tableAbsoluteName = String.format(" `%s.%s` ", HiveTestUtil.hiveSyncConfig.databaseName, tableName); + ddlExecutor.runSQL("create database " + HiveTestUtil.DB_NAME); + reinitHiveSyncClient(); + String tableName = HiveTestUtil.TABLE_NAME; + String tableAbsoluteName = String.format(" `%s.%s` ", HiveTestUtil.DB_NAME, tableName); String dropTableSql = String.format("DROP TABLE IF EXISTS %s ", tableAbsoluteName); String createTableSqlPrefix = String.format("CREATE TABLE IF NOT EXISTS %s ", tableAbsoluteName); String errorMsg = "An error occurred in decimal type converting."; @@ -1021,31 +953,40 @@ public void testTypeConverter(String syncMode) throws Exception { @ParameterizedTest @MethodSource("syncMode") public void testSyncWithoutDiffs(String syncMode) throws Exception { - hiveSyncConfig.syncMode = syncMode; - hiveSyncConfig.isConditionalSync = true; - HiveTestUtil.hiveSyncConfig.batchSyncNum = 2; - String tableName = HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; + String tableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE; + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_CONDITIONAL_SYNC.key(), "true"); String commitTime0 = "100"; String commitTime1 = "101"; String commitTime2 = "102"; HiveTestUtil.createMORTable(commitTime0, commitTime1, 2, true, true); - HoodieHiveClient hiveClient = - new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - - HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - tool.syncHoodieTable(); + reinitHiveSyncClient(); + reSyncHiveTable(); - assertTrue(hiveClient.doesTableExist(tableName)); + assertTrue(hiveClient.tableExists(tableName)); assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(tableName).get()); HiveTestUtil.addMORPartitions(0, true, true, true, ZonedDateTime.now().plusDays(2), commitTime1, commitTime2); - tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); - tool.syncHoodieTable(); - hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem); + reSyncHiveTable(); assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(tableName).get()); } + private void reSyncHiveTable() { + hiveSyncTool.syncHoodieTable(); + // we need renew the hiveclient after tool.syncHoodieTable(), because it will close hive + // session, then lead to connection retry, we can see there is a exception at log. 
+ reinitHiveSyncClient(); + } + + private void reinitHiveSyncClient() { + hiveSyncTool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf(), fileSystem); + hiveClient = (HoodieHiveClient) hiveSyncTool.hoodieHiveClient; + } + + private int getPartitionFieldSize() { + return hiveSyncProps.getString(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key()).split(",").length; + } } diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestMultiPartKeysValueExtractor.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestMultiPartKeysValueExtractor.java index 47d4d2759e781..a07bfb1039454 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestMultiPartKeysValueExtractor.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestMultiPartKeysValueExtractor.java @@ -18,6 +18,8 @@ package org.apache.hudi.hive; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; + import java.util.ArrayList; import java.util.List; import org.junit.jupiter.api.Test; diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestPartitionValueExtractor.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestPartitionValueExtractor.java index ba5a544af18b8..81765ac2860e2 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestPartitionValueExtractor.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestPartitionValueExtractor.java @@ -18,6 +18,9 @@ package org.apache.hudi.hive; +import org.apache.hudi.sync.common.HiveStylePartitionValueExtractor; +import org.apache.hudi.sync.common.SlashEncodedHourPartitionValueExtractor; + import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.List; diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java index a3bc2268dcac2..4ea2599805c63 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieBaseFile; @@ -72,7 +73,6 @@ import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -86,16 +86,21 @@ @SuppressWarnings("SameParameterValue") public class HiveTestUtil { + public static final String DB_NAME = "testdb"; + public static String TABLE_NAME = "test1"; + public static String basePath; + public static TypedProperties hiveSyncProps; + public static HiveTestService hiveTestService; + public static FileSystem fileSystem; + public static QueryBasedDDLExecutor ddlExecutor; + private static ZooKeeperServer zkServer; private static HiveServer2 hiveServer; - public static HiveTestService hiveTestService; private static ZookeeperTestService zkService; private static Configuration configuration; - public static HiveSyncConfig hiveSyncConfig; + private static HiveSyncConfig hiveSyncConfig; private static DateTimeFormatter 
dtfOut; - public static FileSystem fileSystem; private static Set createdTablesSet = new HashSet<>(); - public static QueryBasedDDLExecutor ddlExecutor; public static void setUp() throws IOException, InterruptedException, HiveException, MetaException { configuration = new Configuration(); @@ -109,16 +114,21 @@ public static void setUp() throws IOException, InterruptedException, HiveExcepti } fileSystem = FileSystem.get(configuration); - hiveSyncConfig = new HiveSyncConfig(); - hiveSyncConfig.jdbcUrl = hiveTestService.getJdbcHive2Url(); - hiveSyncConfig.hiveUser = ""; - hiveSyncConfig.hivePass = ""; - hiveSyncConfig.databaseName = "testdb"; - hiveSyncConfig.tableName = "test1"; - hiveSyncConfig.basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString(); - hiveSyncConfig.assumeDatePartitioning = true; - hiveSyncConfig.usePreApacheInputFormat = false; - hiveSyncConfig.partitionFields = Collections.singletonList("datestr"); + basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString(); + + hiveSyncProps = new TypedProperties(); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveTestService.getJdbcHive2Url()); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USER.key(), ""); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_PASS.key(), ""); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), DB_NAME); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), TABLE_NAME); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_BASE_PATH, basePath); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "true"); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false"); + hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); + hiveSyncProps.setProperty(HiveSyncConfig.HIVE_BATCH_SYNC_PARTITION_NUM.key(), "3"); + + hiveSyncConfig = new HiveSyncConfig(hiveSyncProps); dtfOut = DateTimeFormatter.ofPattern("yyyy/MM/dd"); ddlExecutor = new HiveQueryDDLExecutor(hiveSyncConfig, fileSystem, getHiveConf()); @@ -127,18 +137,18 @@ public static void setUp() throws IOException, InterruptedException, HiveExcepti } public static void clear() throws IOException, HiveException, MetaException { - fileSystem.delete(new Path(hiveSyncConfig.basePath), true); + fileSystem.delete(new Path(basePath), true); HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.COPY_ON_WRITE) - .setTableName(hiveSyncConfig.tableName) + .setTableName(TABLE_NAME) .setPayloadClass(HoodieAvroPayload.class) - .initTable(configuration, hiveSyncConfig.basePath); + .initTable(configuration, basePath); for (String tableName : createdTablesSet) { ddlExecutor.runSQL("drop table if exists " + tableName); } createdTablesSet.clear(); - ddlExecutor.runSQL("drop database if exists " + hiveSyncConfig.databaseName + " cascade"); + ddlExecutor.runSQL("drop database if exists " + DB_NAME + " cascade"); } public static HiveConf getHiveConf() { @@ -159,32 +169,32 @@ public static void shutdown() { public static void createCOWTable(String instantTime, int numberOfPartitions, boolean useSchemaFromCommitMetadata) throws IOException, URISyntaxException { - Path path = new Path(hiveSyncConfig.basePath); - FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath)); + Path path = new Path(basePath); + FileIOUtils.deleteDirectory(new File(basePath)); HoodieTableMetaClient.withPropertyBuilder() 
.setTableType(HoodieTableType.COPY_ON_WRITE) - .setTableName(hiveSyncConfig.tableName) + .setTableName(TABLE_NAME) .setPayloadClass(HoodieAvroPayload.class) - .initTable(configuration, hiveSyncConfig.basePath); + .initTable(configuration, basePath); boolean result = fileSystem.mkdirs(path); checkResult(result); ZonedDateTime dateTime = ZonedDateTime.now(); HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, useSchemaFromCommitMetadata, dateTime, instantTime); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); + createdTablesSet.add(DB_NAME + "." + TABLE_NAME); createCommitFile(commitMetadata, instantTime); } public static void createCOWTableWithSchema(String instantTime, String schemaFileName) throws IOException, URISyntaxException { - Path path = new Path(hiveSyncConfig.basePath); - FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath)); + Path path = new Path(basePath); + FileIOUtils.deleteDirectory(new File(basePath)); HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.COPY_ON_WRITE) - .setTableName(hiveSyncConfig.tableName) + .setTableName(TABLE_NAME) .setPayloadClass(HoodieAvroPayload.class) - .initTable(configuration, hiveSyncConfig.basePath); + .initTable(configuration, basePath); boolean result = fileSystem.mkdirs(path); checkResult(result); @@ -192,7 +202,7 @@ public static void createCOWTableWithSchema(String instantTime, String schemaFil HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); String partitionPath = dateTime.format(dtfOut); - Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath); + Path partPath = new Path(basePath + "/" + partitionPath); fileSystem.makeQualified(partPath); fileSystem.mkdirs(partPath); List writeStats = new ArrayList<>(); @@ -206,20 +216,20 @@ public static void createCOWTableWithSchema(String instantTime, String schemaFil writeStats.add(writeStat); writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s)); commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schema.toString()); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); + createdTablesSet.add(DB_NAME + "." + TABLE_NAME); createCommitFile(commitMetadata, instantTime); } public static void createMORTable(String commitTime, String deltaCommitTime, int numberOfPartitions, boolean createDeltaCommit, boolean useSchemaFromCommitMetadata) throws IOException, URISyntaxException, InterruptedException { - Path path = new Path(hiveSyncConfig.basePath); - FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath)); + Path path = new Path(basePath); + FileIOUtils.deleteDirectory(new File(basePath)); HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.MERGE_ON_READ) - .setTableName(hiveSyncConfig.tableName) + .setTableName(TABLE_NAME) .setPayloadClass(HoodieAvroPayload.class) - .initTable(configuration, hiveSyncConfig.basePath); + .initTable(configuration, basePath); boolean result = fileSystem.mkdirs(path); checkResult(result); @@ -227,9 +237,9 @@ public static void createMORTable(String commitTime, String deltaCommitTime, int HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, useSchemaFromCommitMetadata, dateTime, commitTime); createdTablesSet - .add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE); + .add(DB_NAME + "." 
+ TABLE_NAME + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE); createdTablesSet - .add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE); + .add(DB_NAME + "." + TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE); HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata(); commitMetadata.getPartitionToWriteStats() .forEach((key, value) -> value.forEach(l -> compactionMetadata.addWriteStat(key, l))); @@ -248,7 +258,7 @@ public static void addCOWPartitions(int numberOfPartitions, boolean isParquetSch boolean useSchemaFromCommitMetadata, ZonedDateTime startFrom, String instantTime) throws IOException, URISyntaxException { HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, isParquetSchemaSimple, useSchemaFromCommitMetadata, startFrom, instantTime); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); + createdTablesSet.add(DB_NAME + "." + TABLE_NAME); createCommitFile(commitMetadata, instantTime); } @@ -256,7 +266,7 @@ public static void addCOWPartition(String partitionPath, boolean isParquetSchema boolean useSchemaFromCommitMetadata, String instantTime) throws IOException, URISyntaxException { HoodieCommitMetadata commitMetadata = createPartition(partitionPath, isParquetSchemaSimple, useSchemaFromCommitMetadata, instantTime); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); + createdTablesSet.add(DB_NAME + "." + TABLE_NAME); createCommitFile(commitMetadata, instantTime); } @@ -265,8 +275,8 @@ public static void addMORPartitions(int numberOfPartitions, boolean isParquetSch throws IOException, URISyntaxException, InterruptedException { HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, isParquetSchemaSimple, useSchemaFromCommitMetadata, startFrom, instantTime); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE); + createdTablesSet.add(DB_NAME + "." + TABLE_NAME + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE); + createdTablesSet.add(DB_NAME + "." 
+ TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE); HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata(); commitMetadata.getPartitionToWriteStats() .forEach((key, value) -> value.forEach(l -> compactionMetadata.addWriteStat(key, l))); @@ -305,7 +315,7 @@ private static HoodieCommitMetadata createPartitions(int numberOfPartitions, boo HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); for (int i = 0; i < numberOfPartitions; i++) { String partitionPath = startFrom.format(dtfOut); - Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath); + Path partPath = new Path(basePath + "/" + partitionPath); fileSystem.makeQualified(partPath); fileSystem.mkdirs(partPath); List writeStats = createTestData(partPath, isParquetSchemaSimple, instantTime); @@ -319,7 +329,7 @@ private static HoodieCommitMetadata createPartitions(int numberOfPartitions, boo private static HoodieCommitMetadata createPartition(String partitionPath, boolean isParquetSchemaSimple, boolean useSchemaFromCommitMetadata, String instantTime) throws IOException, URISyntaxException { HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); - Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath); + Path partPath = new Path(basePath + "/" + partitionPath); fileSystem.makeQualified(partPath); fileSystem.mkdirs(partPath); List writeStats = createTestData(partPath, isParquetSchemaSimple, instantTime); @@ -435,7 +445,7 @@ private static void checkResult(boolean result) { public static void createCommitFile(HoodieCommitMetadata commitMetadata, String instantTime) throws IOException { byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8); - Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + Path fullPath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(instantTime)); FSDataOutputStream fsout = fileSystem.create(fullPath, true); fsout.write(bytes); @@ -450,7 +460,7 @@ public static void createCommitFileWithSchema(HoodieCommitMetadata commitMetadat private static void createCompactionCommitFile(HoodieCommitMetadata commitMetadata, String instantTime) throws IOException { byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8); - Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + Path fullPath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(instantTime)); FSDataOutputStream fsout = fileSystem.create(fullPath, true); fsout.write(bytes); @@ -460,7 +470,7 @@ private static void createCompactionCommitFile(HoodieCommitMetadata commitMetada private static void createDeltaCommitFile(HoodieCommitMetadata deltaCommitMetadata, String deltaCommitTime) throws IOException { byte[] bytes = deltaCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8); - Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + Path fullPath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeDeltaFileName(deltaCommitTime)); FSDataOutputStream fsout = fileSystem.create(fullPath, true); fsout.write(bytes); diff --git a/hudi-sync/hudi-sync-common/pom.xml b/hudi-sync/hudi-sync-common/pom.xml index 1f1abb4f177f1..d62e88593dea7 100644 --- a/hudi-sync/hudi-sync-common/pom.xml +++ b/hudi-sync/hudi-sync-common/pom.xml @@ -45,6 +45,10 @@ org.apache.hadoop 
hadoop-common + + com.beust + jcommander + diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java index ce4720ac00907..f87153d09d711 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java @@ -42,10 +42,10 @@ import java.util.Map; import java.util.Objects; -public abstract class AbstractSyncHoodieClient { +public abstract class AbstractSyncHoodieClient implements AutoCloseable { private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class); - + protected static final String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync"; public static final TypeConverter TYPE_CONVERTOR = new TypeConverter() {}; protected final HoodieTableMetaClient metaClient; @@ -88,7 +88,7 @@ public abstract void createTable(String tableName, MessageType storageSchema, String serdeClass, Map serdeProperties, Map tableProperties); - public abstract boolean doesTableExist(String tableName); + public abstract boolean tableExists(String tableName); public abstract Option getLastCommitTimeSynced(String tableName); @@ -218,6 +218,13 @@ private MessageType readSchemaFromLogFile(Option lastCompactionCo return messageType; } + /** + * Releases any resources used by the client. + */ + @Override + public void close() { + } + /** * Partition Event captures any partition that needs to be added or updated. */ diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncTool.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncTool.java index 6621468ee2e25..51db9cb82dc83 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncTool.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncTool.java @@ -17,17 +17,23 @@ package org.apache.hudi.sync.common; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hudi.common.config.TypedProperties; -import java.util.Properties; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; public abstract class AbstractSyncTool { - protected Properties props; - protected FileSystem fileSystem; + protected final Configuration conf; + protected final FileSystem fs; + protected final HoodieSyncConfig syncConfig; + protected TypedProperties props; - public AbstractSyncTool(Properties props, FileSystem fileSystem) { + public AbstractSyncTool(TypedProperties props, Configuration conf, FileSystem fs) { this.props = props; - this.fileSystem = fileSystem; + this.conf = conf; + this.fs = fs; + + this.syncConfig = new HoodieSyncConfig(props); } public abstract void syncHoodieTable(); diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveStylePartitionValueExtractor.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HiveStylePartitionValueExtractor.java similarity index 97% rename from hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveStylePartitionValueExtractor.java rename to hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HiveStylePartitionValueExtractor.java index 4bb20f5e5f142..f96e588d01d7e 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveStylePartitionValueExtractor.java +++ 
b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HiveStylePartitionValueExtractor.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.hive; +package org.apache.hudi.sync.common; import java.util.Collections; import java.util.List; diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java new file mode 100644 index 0000000000000..67bae8d045362 --- /dev/null +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sync.common; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import com.beust.jcommander.Parameter; + +import java.util.ArrayList; +import java.util.List; + +/** + * Configs needed to sync data into external meta stores, catalogs, etc. + */ +public class HoodieSyncConfig extends HoodieConfig { + + public static final String META_SYNC_BASE_PATH = "meta.sync.base.path"; + + @Parameter(names = {"--database"}, description = "name of the target database in Hive", required = true) + public String databaseName; + + @Parameter(names = {"--table"}, description = "name of the target table in Hive", required = true) + public String tableName; + + @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true) + public String basePath; + + @Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)") + public String baseFileFormat; + + @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by") + public List partitionFields; + + @Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor " + + "to extract the partition values from HDFS path") + public String partitionValueExtractorClass; + + @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this" + + " exists to support backward compatibility. 
If you use hoodie 0.3.x, do not set this parameter") + public Boolean assumeDatePartitioning; + + @Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing") + public Boolean decodePartition; + + @Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.") + public Boolean isConditionalSync; + + public static final ConfigProperty META_SYNC_ENABLED = ConfigProperty + .key("hoodie.datasource.meta.sync.enable") + .defaultValue("false") + .withDocumentation("Enable Syncing the Hudi Table with an external meta store or data catalog."); + + // ToDo change the prefix of the following configs from hive_sync to meta_sync + public static final ConfigProperty META_SYNC_DATABASE_NAME = ConfigProperty + .key("hoodie.datasource.hive_sync.database") + .defaultValue("default") + .withDocumentation("The name of the destination database that we should sync the hudi table to."); + + // If the table name for the metastore destination is not provided, pick it up from write or table configs. + public static final ConfigProperty META_SYNC_TABLE_NAME = ConfigProperty + .key("hoodie.datasource.hive_sync.table") + .defaultValue("unknown") + .withInferFunction(cfg -> { + if (cfg.contains(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY)) { + return Option.of(cfg.getString(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY)); + } else if (cfg.contains(HoodieTableConfig.HOODIE_TABLE_NAME_KEY)) { + return Option.of(cfg.getString(HoodieTableConfig.HOODIE_TABLE_NAME_KEY)); + } else { + return Option.empty(); + } + }) + .withDocumentation("The name of the destination table that we should sync the hudi table to."); + + public static final ConfigProperty META_SYNC_BASE_FILE_FORMAT = ConfigProperty + .key("hoodie.datasource.hive_sync.base_file_format") + .defaultValue("PARQUET") + .withDocumentation("Base file format for the sync."); + + // If partition fields are not explicitly provided, obtain from the KeyGeneration Configs + public static final ConfigProperty META_SYNC_PARTITION_FIELDS = ConfigProperty + .key("hoodie.datasource.hive_sync.partition_fields") + .defaultValue("") + .withInferFunction(cfg -> { + if (cfg.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)) { + return Option.of(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)); + } else { + return Option.empty(); + } + }) + .withDocumentation("Field in the table to use for determining hive partition columns."); + + // If partition value extraction class is not explicitly provided, configure based on the partition fields. 
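+ // (per the infer function below: no partition field configured -> NonPartitionedExtractor; a single hive-style partition field -> HiveStylePartitionValueExtractor; otherwise MultiPartKeysValueExtractor)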
+ public static final ConfigProperty META_SYNC_PARTITION_EXTRACTOR_CLASS = ConfigProperty + .key("hoodie.datasource.hive_sync.partition_extractor_class") + .defaultValue(SlashEncodedDayPartitionValueExtractor.class.getName()) + .withInferFunction(cfg -> { + if (!cfg.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)) { + return Option.of(NonPartitionedExtractor.class.getName()); + } else { + int numOfPartFields = cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME).split(",").length; + if (numOfPartFields == 1 + && cfg.contains(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE) + && cfg.getString(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE).equals("true")) { + return Option.of(HiveStylePartitionValueExtractor.class.getName()); + } else { + return Option.of(MultiPartKeysValueExtractor.class.getName()); + } + } + }) + .withDocumentation("Class which implements PartitionValueExtractor to extract the partition values, " + + "default 'SlashEncodedDayPartitionValueExtractor'."); + + public static final ConfigProperty META_SYNC_ASSUME_DATE_PARTITION = ConfigProperty + .key("hoodie.datasource.hive_sync.assume_date_partitioning") + .defaultValue("false") + .withDocumentation("Assume partitioning is yyyy/mm/dd"); + + public static final ConfigProperty META_SYNC_CONDITIONAL_SYNC = ConfigProperty + .key("hoodie.datasource.meta_sync.condition.sync") + .defaultValue(false) + .withDocumentation("If true, only sync on conditions like schema change or partition change."); + + public HoodieSyncConfig(TypedProperties props) { + super(props); + + this.basePath = props.getString(META_SYNC_BASE_PATH, ""); + this.databaseName = getStringOrDefault(META_SYNC_DATABASE_NAME); + this.tableName = getStringOrDefault(META_SYNC_TABLE_NAME); + this.baseFileFormat = getStringOrDefault(META_SYNC_BASE_FILE_FORMAT); + this.partitionFields = props.getStringList(META_SYNC_PARTITION_FIELDS.key(), ",", new ArrayList<>()); + this.partitionValueExtractorClass = getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS); + this.assumeDatePartitioning = getBooleanOrDefault(META_SYNC_ASSUME_DATE_PARTITION); + this.decodePartition = getBooleanOrDefault(KeyGeneratorOptions.URL_ENCODE_PARTITIONING); + this.isConditionalSync = getBooleanOrDefault(META_SYNC_CONDITIONAL_SYNC); + } +} diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/MultiPartKeysValueExtractor.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/MultiPartKeysValueExtractor.java similarity index 98% rename from hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/MultiPartKeysValueExtractor.java rename to hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/MultiPartKeysValueExtractor.java index ae8f63639b768..da587518d6bdf 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/MultiPartKeysValueExtractor.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/MultiPartKeysValueExtractor.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.hudi.hive; +package org.apache.hudi.sync.common; import java.util.Collections; import org.apache.hudi.common.util.ValidationUtils; diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/NonPartitionedExtractor.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/NonPartitionedExtractor.java similarity index 96% rename from hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/NonPartitionedExtractor.java rename to hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/NonPartitionedExtractor.java index dc6243910e132..7bc88aefd459a 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/NonPartitionedExtractor.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/NonPartitionedExtractor.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.hive; +package org.apache.hudi.sync.common; import java.util.ArrayList; import java.util.List; diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/PartitionValueExtractor.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/PartitionValueExtractor.java similarity index 97% rename from hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/PartitionValueExtractor.java rename to hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/PartitionValueExtractor.java index f4820e3162f1d..34239662ef649 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/PartitionValueExtractor.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/PartitionValueExtractor.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.hive; +package org.apache.hudi.sync.common; import java.io.Serializable; import java.util.List; diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedDayPartitionValueExtractor.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/SlashEncodedDayPartitionValueExtractor.java similarity index 98% rename from hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedDayPartitionValueExtractor.java rename to hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/SlashEncodedDayPartitionValueExtractor.java index d6cd3ba24dd74..eeefda61a190a 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedDayPartitionValueExtractor.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/SlashEncodedDayPartitionValueExtractor.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.hudi.hive; +package org.apache.hudi.sync.common; import java.time.LocalDateTime; import java.time.ZoneId; diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedHourPartitionValueExtractor.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/SlashEncodedHourPartitionValueExtractor.java similarity index 98% rename from hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedHourPartitionValueExtractor.java rename to hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/SlashEncodedHourPartitionValueExtractor.java index c8207f582abad..694e844a3b085 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedHourPartitionValueExtractor.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/SlashEncodedHourPartitionValueExtractor.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.hive; +package org.apache.hudi.sync.common; import java.time.LocalDateTime; import java.time.ZoneId; diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SyncUtilHelpers.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SyncUtilHelpers.java new file mode 100644 index 0000000000000..7843fdf070129 --- /dev/null +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SyncUtilHelpers.java @@ -0,0 +1,38 @@ +package org.apache.hudi.sync.common.util; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.sync.common.AbstractSyncTool; +import org.apache.hudi.sync.common.HoodieSyncConfig; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; + +public class SyncUtilHelpers { + + /** + * Create an instance of an implementation of {@link AbstractSyncTool} that will sync all the relevant meta information + * with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems. + * + * @param metaSyncClass The class that implements the sync of the metadata. + * @param props property map. + * @param hadoopConfig Hadoop confs. + * @param fs Filesystem used. + * @param targetBasePath The target base path that contains the hoodie table. + * @param baseFileFormat The file format used by the hoodie table (defaults to PARQUET).
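+ * Note: the sync tool class is loaded reflectively and is expected to expose a (TypedProperties, Configuration, FileSystem) constructor.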
+ */ + public static void createAndSyncHoodieMeta(String metaSyncClass, TypedProperties props, Configuration hadoopConfig, FileSystem fs, + String targetBasePath, String baseFileFormat) { + TypedProperties properties = new TypedProperties(); + properties.putAll(props); + properties.put(HoodieSyncConfig.META_SYNC_BASE_PATH, targetBasePath); + properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT, baseFileFormat); + ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncClass, + new Class[] {TypedProperties.class, Configuration.class, FileSystem.class}, + properties, hadoopConfig, fs)).syncHoodieTable(); + } +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java index 81c5caf82142f..434381fdb7d9c 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java @@ -19,6 +19,7 @@ package org.apache.hudi.utilities; import org.apache.hadoop.conf.Configuration; + import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; @@ -105,13 +106,13 @@ public static Source createSource(String sourceClass, TypedProperties cfg, JavaS try { try { return (Source) ReflectionUtils.loadClass(sourceClass, - new Class[]{TypedProperties.class, JavaSparkContext.class, + new Class[] {TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class, HoodieDeltaStreamerMetrics.class}, cfg, jssc, sparkSession, schemaProvider, metrics); } catch (HoodieException e) { return (Source) ReflectionUtils.loadClass(sourceClass, - new Class[]{TypedProperties.class, JavaSparkContext.class, + new Class[] {TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class}, cfg, jssc, sparkSession, schemaProvider); } @@ -121,7 +122,7 @@ public static Source createSource(String sourceClass, TypedProperties cfg, JavaS } public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg, - JavaSparkContext jssc) throws IOException { + JavaSparkContext jssc) throws IOException { try { return StringUtils.isNullOrEmpty(schemaProviderClass) ?
null : (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); @@ -397,21 +398,21 @@ public static SchemaProvider getOriginalSchemaProvider(SchemaProvider schemaProv } public static SchemaProviderWithPostProcessor wrapSchemaProviderWithPostProcessor(SchemaProvider provider, - TypedProperties cfg, JavaSparkContext jssc, List transformerClassNames) { + TypedProperties cfg, JavaSparkContext jssc, List transformerClassNames) { if (provider == null) { return null; } - if (provider instanceof SchemaProviderWithPostProcessor) { - return (SchemaProviderWithPostProcessor)provider; + if (provider instanceof SchemaProviderWithPostProcessor) { + return (SchemaProviderWithPostProcessor) provider; } String schemaPostProcessorClass = cfg.getString(Config.SCHEMA_POST_PROCESSOR_PROP, null); boolean enableSparkAvroPostProcessor = Boolean.parseBoolean(cfg.getString(SparkAvroPostProcessor.Config.SPARK_AVRO_POST_PROCESSOR_PROP_ENABLE, "true")); if (transformerClassNames != null && !transformerClassNames.isEmpty() - && enableSparkAvroPostProcessor && StringUtils.isNullOrEmpty(schemaPostProcessorClass)) { + && enableSparkAvroPostProcessor && StringUtils.isNullOrEmpty(schemaPostProcessorClass)) { schemaPostProcessorClass = SparkAvroPostProcessor.class.getName(); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java index 17fecdeccf0fe..78bb30302b287 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java @@ -18,7 +18,6 @@ package org.apache.hudi.utilities.deltastreamer; -import org.apache.hudi.DataSourceUtils; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.common.HoodieSparkEngineContext; @@ -31,16 +30,15 @@ import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.sync.common.HoodieSyncConfig; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; @@ -160,10 +158,9 @@ public void execute() throws IOException { */ private void syncHive() { if (cfg.enableHiveSync) { - HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat); - LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). 
Hive metastore URL :" - + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); - new HiveSyncTool(hiveSyncConfig, new HiveConf(configuration, HiveConf.class), fs).syncHoodieTable(); + props.put(HoodieSyncConfig.META_SYNC_BASE_PATH, cfg.targetBasePath); + props.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT, cfg.baseFileFormat); + new HiveSyncTool(props, configuration, fs).syncHoodieTable(); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index eb553c94e43ea..cf8a4caae628a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -41,7 +41,6 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; @@ -51,12 +50,11 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.keygen.SimpleKeyGenerator; import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; -import org.apache.hudi.sync.common.AbstractSyncTool; +import org.apache.hudi.sync.common.util.SyncUtilHelpers; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallback; import org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallbackConfig; @@ -77,7 +75,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; @@ -95,7 +92,6 @@ import java.util.HashSet; import java.util.List; import java.util.Objects; -import java.util.Properties; import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -174,7 +170,7 @@ public class DeltaSync implements Serializable { /** * Bag of properties with source, hoodie client, key generator etc. - * + *

* NOTE: These properties are already consolidated w/ CLI provided config-overrides */ private final TypedProperties props; @@ -268,7 +264,7 @@ public void refreshTimeline() throws IOException { SimpleKeyGenerator.class.getName())) .setPreCombineField(cfg.sourceOrderingField) .initTable(new Configuration(jssc.hadoopConfiguration()), - cfg.targetBasePath); + cfg.targetBasePath); } } @@ -320,7 +316,7 @@ public Pair, JavaRDD> syncOnce() throws IOException * Read from Upstream Source and apply transformation if needed. * * @param commitTimelineOpt Timeline with completed commits - * @return Pair>> Input data read from upstream source, consists + * @return Pair>> Input data read from upstream source, consists * of schemaProvider, checkpointStr and hoodieRecord * @throws Exception in case of any Exception */ @@ -333,7 +329,7 @@ public Pair>> readFromSource( HoodieCommitMetadata commitMetadata = HoodieCommitMetadata .fromBytes(commitTimelineOpt.get().getInstantDetails(lastCommit.get()).get(), HoodieCommitMetadata.class); if (cfg.checkpoint != null && (StringUtils.isNullOrEmpty(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY)) - || !cfg.checkpoint.equals(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY)))) { + || !cfg.checkpoint.equals(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY)))) { resumeCheckpointStr = Option.of(cfg.checkpoint); } else if (!StringUtils.isNullOrEmpty(commitMetadata.getMetadata(CHECKPOINT_KEY))) { //if previous checkpoint is an empty string, skip resume use Option.empty() @@ -418,7 +414,8 @@ public Pair>> readFromSource( targetSchemaProvider = UtilHelpers.createRowBasedSchemaProvider(r.schema(), props, jssc); } return (SchemaProvider) new DelegatingSchemaProvider(props, jssc, - dataAndCheckpoint.getSchemaProvider(), targetSchemaProvider); }) + dataAndCheckpoint.getSchemaProvider(), targetSchemaProvider); + }) .orElse(dataAndCheckpoint.getSchemaProvider()); avroRDDOptional = transformed .map(t -> HoodieSparkUtils.createRdd( @@ -437,7 +434,7 @@ public Pair>> readFromSource( if (Objects.equals(checkpointStr, resumeCheckpointStr.orElse(null))) { LOG.info("No new data, source checkpoint has not changed. Nothing to commit. Old checkpoint=(" - + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")"); + + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")"); return null; } @@ -612,43 +609,16 @@ private void syncMeta(HoodieDeltaStreamerMetrics metrics) { LOG.info("When set --enable-hive-sync will use HiveSyncTool for backward compatibility"); } if (cfg.enableMetaSync) { + FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()); for (String impl : syncClientToolClasses) { Timer.Context syncContext = metrics.getMetaSyncTimerContext(); - impl = impl.trim(); - switch (impl) { - case "org.apache.hudi.hive.HiveSyncTool": - syncHive(); - break; - default: - FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()); - Properties properties = new Properties(); - properties.putAll(props); - properties.put("basePath", cfg.targetBasePath); - properties.put("baseFileFormat", cfg.baseFileFormat); - AbstractSyncTool syncTool = (AbstractSyncTool) ReflectionUtils.loadClass(impl, new Class[]{Properties.class, FileSystem.class}, properties, fs); - syncTool.syncHoodieTable(); - } + SyncUtilHelpers.createAndSyncHoodieMeta(impl.trim(), props, conf, fs, cfg.targetBasePath, cfg.baseFileFormat); long metaSyncTimeMs = syncContext != null ? 
syncContext.stop() : 0; metrics.updateDeltaStreamerMetaSyncMetrics(getSyncClassShortName(impl), metaSyncTimeMs); } } } - public void syncHive() { - HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat); - LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :" - + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); - HiveConf hiveConf = new HiveConf(conf, HiveConf.class); - LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString()); - LOG.info("Hive Sync Conf => " + hiveSyncConfig.toString()); - new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable(); - } - - public void syncHive(HiveConf conf) { - this.conf = conf; - syncHive(); - } - /** * Note that depending on configs and source-type, schemaProvider could either be eagerly or lazily created. * SchemaProvider creation is a precursor to HoodieWriteClient and AsyncCompactor creation. This method takes care of diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java index dc150803e8b38..d1376e21f62ba 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java @@ -19,7 +19,6 @@ package org.apache.hudi.utilities.deltastreamer; import com.beust.jcommander.Parameter; -import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.client.utils.OperationConverter; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; @@ -28,6 +27,7 @@ import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.sync.common.HoodieSyncConfig; import org.apache.hudi.utilities.IdentitySplitter; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.utilities.UtilHelpers; @@ -128,7 +128,7 @@ private void populateTableExecutionContextList(TypedProperties properties, Strin Helpers.deepCopyConfigs(config, cfg); String overriddenTargetBasePath = tableProperties.getString(Constants.TARGET_BASE_PATH_PROP, ""); cfg.targetBasePath = StringUtils.isNullOrEmpty(overriddenTargetBasePath) ? 
targetBasePath : overriddenTargetBasePath; - if (cfg.enableMetaSync && StringUtils.isNullOrEmpty(tableProperties.getString(DataSourceWriteOptions.HIVE_TABLE().key(), ""))) { + if (cfg.enableMetaSync && StringUtils.isNullOrEmpty(tableProperties.getString(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), ""))) { throw new HoodieException("Meta sync table field not provided!"); } populateSchemaProviderProps(cfg, tableProperties); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java index a217e6b7a8009..2c744edbcec07 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java @@ -24,7 +24,7 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.sync.common.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.IncrSourceHelper; diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java index 06898db92c81c..98b445654a9f9 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java @@ -18,7 +18,6 @@ package org.apache.hudi.utilities.functional; -import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.WriteOperationType; @@ -28,7 +27,8 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; -import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.sync.common.MultiPartKeysValueExtractor; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; @@ -176,11 +176,11 @@ protected static void writeCommonPropsToFile(FileSystem dfs, String dfsBasePath) props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); // Hive Configs - props.setProperty(DataSourceWriteOptions.HIVE_URL().key(), "jdbc:hive2://127.0.0.1:9999/"); - props.setProperty(DataSourceWriteOptions.HIVE_DATABASE().key(), "testdb1"); - props.setProperty(DataSourceWriteOptions.HIVE_TABLE().key(), "hive_trips"); - props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "datestr"); - props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/"); + props.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb1"); + props.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), "hive_trips"); + props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); + props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), 
MultiPartKeysValueExtractor.class.getName()); UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE); } @@ -235,11 +235,11 @@ protected static void populateCommonKafkaProps(TypedProperties props, String bro protected static void populateCommonHiveProps(TypedProperties props) { // Hive Configs - props.setProperty(DataSourceWriteOptions.HIVE_URL().key(), "jdbc:hive2://127.0.0.1:9999/"); - props.setProperty(DataSourceWriteOptions.HIVE_DATABASE().key(), "testdb2"); - props.setProperty(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION().key(), "false"); - props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "datestr"); - props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(), + props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://127.0.0.1:9999/"); + props.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb2"); + props.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "false"); + props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); + props.setProperty(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getName()); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java index b3cf7b28fa20d..f1dbd5c11be55 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java @@ -1079,7 +1079,7 @@ public void testHoodieAsyncClusteringJobWithScheduleAndExecute(String runningMod public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() throws Exception { String tableBasePath = dfsBasePath + "/test_table2"; String downstreamTableBasePath = dfsBasePath + "/test_downstream_table2"; - + HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(tableBasePath, "hive_trips"); // Initial bulk insert to ingest to first hudi table @@ -1146,7 +1146,7 @@ public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() t // Test Hive integration HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs); - assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName), "Table " + hiveSyncConfig.tableName + " should exist"); + assertTrue(hiveClient.tableExists(hiveSyncConfig.tableName), "Table " + hiveSyncConfig.tableName + " should exist"); assertEquals(1, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), "Table partitions should match the number of partitions we wrote"); assertEquals(lastInstantForUpstreamTable,