diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java index 3301c1402d639..4fcde21087e9a 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java @@ -24,9 +24,9 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.TableHeader; import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Reader; @@ -83,7 +83,7 @@ public String showArchivedCommits( // read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - blk.getRecordIterator(HoodieAvroIndexedRecord::new).forEachRemaining(r -> readRecords.add((IndexedRecord) r.getData())); + blk.getRecordIterator(HoodieRecordType.AVRO).forEachRemaining(r -> readRecords.add((IndexedRecord) r.getData())); } List readCommits = readRecords.stream().map(r -> (GenericRecord) r) .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION) @@ -157,8 +157,8 @@ public String showCommits( // read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - try (ClosableIterator recordItr = blk.getRecordIterator(HoodieAvroIndexedRecord::new)) { - recordItr.forEachRemaining(r -> readRecords.add((IndexedRecord) r.getData())); + try (ClosableIterator> recordItr = blk.getRecordIterator(HoodieRecordType.AVRO)) { + recordItr.forEachRemaining(r -> readRecords.add(r.getData())); } } List readCommits = readRecords.stream().map(r -> (GenericRecord) r) diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java index 34fb0aab73a08..686a0dd068283 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java @@ -25,9 +25,9 @@ import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Reader; @@ -125,9 +125,9 @@ private int copyArchivedInstants(List statuses, Set actionSe // read the avro blocks while (reader.hasNext() && copyCount < limit) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - try (ClosableIterator recordItr = blk.getRecordIterator(HoodieAvroIndexedRecord::new)) { + try (ClosableIterator> recordItr = blk.getRecordIterator(HoodieRecordType.AVRO)) { while (recordItr.hasNext()) { - IndexedRecord ir = (IndexedRecord) recordItr.next().getData(); + IndexedRecord ir = recordItr.next().getData(); // Archived instants are saved as arvo 
encoded HoodieArchivedMetaEntry records. We need to get the // metadata record from the entry and convert it to json. HoodieArchivedMetaEntry archiveEntryRecord = (HoodieArchivedMetaEntry) SpecificData.get() diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java index 17d413fb1f8f8..8068ea7a76297 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java @@ -25,8 +25,10 @@ import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroIndexedRecord; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieLogFormat; @@ -38,6 +40,7 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieMemoryConfig; @@ -125,7 +128,7 @@ public String showLogFileCommits( instantTime = "dummy_instant_time_" + dummyInstantTimeCount; } if (n instanceof HoodieDataBlock) { - try (ClosableIterator recordItr = ((HoodieDataBlock) n).getRecordIterator(HoodieAvroIndexedRecord::new)) { + try (ClosableIterator> recordItr = ((HoodieDataBlock) n).getRecordIterator(HoodieRecordType.AVRO)) { recordItr.forEachRemaining(r -> recordCount.incrementAndGet()); } } @@ -221,11 +224,12 @@ public String showLogFileRecords( .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue()) .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); for (HoodieRecord hoodieRecord : scanner) { - Option record = hoodieRecord.toIndexedRecord(readerSchema, new Properties()); + Option record = hoodieRecord.toIndexedRecord(readerSchema, new Properties()); if (allRecords.size() < limit) { - allRecords.add(record.get()); + allRecords.add(record.get().getData()); } } } else { @@ -239,10 +243,10 @@ public String showLogFileRecords( HoodieLogBlock n = reader.next(); if (n instanceof HoodieDataBlock) { HoodieDataBlock blk = (HoodieDataBlock) n; - try (ClosableIterator recordItr = blk.getRecordIterator(HoodieAvroIndexedRecord::new)) { + try (ClosableIterator> recordItr = blk.getRecordIterator(HoodieRecordType.AVRO)) { recordItr.forEachRemaining(record -> { if (allRecords.size() < limit) { - allRecords.add((IndexedRecord) record.getData()); + allRecords.add(record.getData()); } }); } diff --git a/hudi-cli/src/main/scala/org/apache/hudi/cli/SparkHelpers.scala b/hudi-cli/src/main/scala/org/apache/hudi/cli/SparkHelpers.scala index ddb6b6cf2f8d9..0181b0733cc11 100644 --- 
a/hudi-cli/src/main/scala/org/apache/hudi/cli/SparkHelpers.scala +++ b/hudi-cli/src/main/scala/org/apache/hudi/cli/SparkHelpers.scala @@ -23,14 +23,14 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hudi.avro.HoodieAvroWriteSupport import org.apache.hudi.client.SparkTaskContextSupplier import org.apache.hudi.common.bloom.{BloomFilter, BloomFilterFactory} +import org.apache.hudi.common.config.HoodieStorageConfig import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord} import org.apache.hudi.common.util.BaseFileUtils -import org.apache.hudi.config.{HoodieIndexConfig, HoodieStorageConfig} +import org.apache.hudi.config.HoodieIndexConfig import org.apache.hudi.io.storage.{HoodieAvroParquetWriter, HoodieParquetConfig} import org.apache.parquet.avro.AvroSchemaConverter import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.{DataFrame, SQLContext} - import scala.collection.JavaConversions._ import scala.collection.mutable._ diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java index efe89b69eff1e..c26b7efcb23c5 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java @@ -29,6 +29,7 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieAvroIndexedRecord; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -38,6 +39,7 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieMemoryConfig; @@ -222,6 +224,7 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue()) .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); Iterator records = scanner.iterator(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java index 16db94bffd6a6..f4ab9fe703739 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java @@ -32,6 +32,7 @@ import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieRecord; import 
org.apache.hudi.common.table.HoodieTableMetaClient; @@ -344,7 +345,7 @@ public void mergeArchiveFiles(List compactCandidate) throws IOExcept // Read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - blk.getRecordIterator(HoodieAvroIndexedRecord::new).forEachRemaining(r -> records.add((IndexedRecord) r.getData())); + blk.getRecordIterator(HoodieRecordType.AVRO).forEachRemaining(r -> records.add((IndexedRecord) r.getData())); if (records.size() >= this.config.getCommitArchivalBatchSize()) { writeToFile(wrapperSchema, records); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/FullRecordBootstrapDataProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/FullRecordBootstrapDataProvider.java index 1cf1702717295..636e0e4b2603a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/FullRecordBootstrapDataProvider.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/FullRecordBootstrapDataProvider.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -49,8 +50,9 @@ public FullRecordBootstrapDataProvider(TypedProperties props, HoodieEngineContex * @param tableName Hudi Table Name * @param sourceBasePath Source Base Path * @param partitionPaths Partition Paths + * @param config config * @return input records */ public abstract I generateInputRecords(String tableName, - String sourceBasePath, List>> partitionPaths); + String sourceBasePath, List>> partitionPaths, HoodieWriteConfig config); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java index 52e411108f3d2..35ca3d6d5adec 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java @@ -21,62 +21,32 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodiePayloadConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; import java.io.IOException; import java.util.Iterator; -import java.util.stream.StreamSupport; +import java.util.Properties; /** * Reads records from base file and merges any updates from log files and provides iterable over all records in the file slice. 
*/ public class HoodieFileSliceReader implements Iterator> { + private final Iterator> recordsIterator; public static HoodieFileSliceReader getFileSliceReader( - Option baseFileReader, HoodieMergedLogRecordScanner scanner, Schema schema, String payloadClass, - String preCombineField, Option> simpleKeyGenFieldsOpt) throws IOException { + Option baseFileReader, HoodieMergedLogRecordScanner scanner, Schema schema, Properties props, Option> simpleKeyGenFieldsOpt) throws IOException { if (baseFileReader.isPresent()) { - Iterator baseIterator = baseFileReader.get().getRecordIterator(schema); + Iterator baseIterator = baseFileReader.get().getRecordIterator(schema); while (baseIterator.hasNext()) { - GenericRecord record = (GenericRecord) baseIterator.next(); - HoodieRecord hoodieRecord = transform( - record, scanner, payloadClass, preCombineField, simpleKeyGenFieldsOpt); - scanner.processNextRecord(hoodieRecord); + scanner.processNextRecord(baseIterator.next().wrapIntoHoodieRecordPayloadWithParams(schema, props, + simpleKeyGenFieldsOpt, scanner.isWithOperationField(), scanner.getPartitionName(), false)); } - return new HoodieFileSliceReader(scanner.iterator()); - } else { - Iterable iterable = () -> scanner.iterator(); - HoodiePayloadConfig payloadConfig = HoodiePayloadConfig.newBuilder().withPayloadOrderingField(preCombineField).build(); - return new HoodieFileSliceReader(StreamSupport.stream(iterable.spliterator(), false) - .map(e -> { - try { - GenericRecord record = (GenericRecord) e.toIndexedRecord(schema, payloadConfig.getProps()).get(); - return transform(record, scanner, payloadClass, preCombineField, simpleKeyGenFieldsOpt); - } catch (IOException io) { - throw new HoodieIOException("Error while creating reader for file slice with no base file.", io); - } - }).iterator()); } - } - - private static HoodieRecord transform(GenericRecord record, - HoodieMergedLogRecordScanner scanner, - String payloadClass, - String preCombineField, - Option> simpleKeyGenFieldsOpt) { - return simpleKeyGenFieldsOpt.isPresent() - ? SpillableMapUtils.convertToHoodieRecordPayload(record, - payloadClass, preCombineField, simpleKeyGenFieldsOpt.get(), scanner.isWithOperationField(), Option.empty()) - : SpillableMapUtils.convertToHoodieRecordPayload(record, - payloadClass, preCombineField, scanner.isWithOperationField(), scanner.getPartitionName()); + return new HoodieFileSliceReader(scanner.iterator()); } private HoodieFileSliceReader(Iterator> recordsItr) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java index 8a3e7f3db2609..3b61d49727290 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java @@ -22,7 +22,6 @@ import org.apache.hudi.common.config.ConfigGroups; import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.HoodieConfig; -import org.apache.hudi.common.model.HoodieAvroRecordMerge; import org.apache.hudi.table.action.compact.CompactionTriggerStrategy; import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; import org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy; @@ -120,12 +119,6 @@ public class HoodieCompactionConfig extends HoodieConfig { + "compaction during each compaction run. By default. 
Hudi picks the log file " + "with most accumulated unmerged data"); - public static final ConfigProperty MERGE_CLASS_NAME = ConfigProperty - .key("hoodie.compaction.merge.class") - .defaultValue(HoodieAvroRecordMerge.class.getName()) - .withDocumentation("Merge class provide stateless component interface for merging records, and support various HoodieRecord " - + "types, such as Spark records or Flink records."); - public static final ConfigProperty COMPACTION_LAZY_BLOCK_READ_ENABLE = ConfigProperty .key("hoodie.compaction.lazy.block.read") .defaultValue("true") @@ -359,11 +352,6 @@ public Builder withCompactionStrategy(CompactionStrategy compactionStrategy) { return this; } - public Builder withMergeClass(String mergeClass) { - compactionConfig.setValue(MERGE_CLASS_NAME, mergeClass); - return this; - } - public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) { compactionConfig.setValue(TARGET_IO_PER_COMPACTION_IN_MB, String.valueOf(targetIOPerCompactionInMB)); return this; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index cc5428f4e64c9..b04f57ada5993 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -29,14 +29,16 @@ import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.HoodieMetastoreConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FileSystemRetryConfig; -import org.apache.hudi.common.model.HoodieAvroRecordMerge; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.WriteConcurrencyMode; @@ -45,6 +47,7 @@ import org.apache.hudi.common.table.marker.MarkerType; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; @@ -126,11 +129,17 @@ public class HoodieWriteConfig extends HoodieConfig { .withDocumentation("Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. 
" + "This will render any value set for PRECOMBINE_FIELD_OPT_VAL in-effective"); - public static final ConfigProperty MERGE_CLASS_NAME = ConfigProperty - .key("hoodie.datasource.write.merge.class") - .defaultValue(HoodieAvroRecordMerge.class.getName()) - .withDocumentation("Merge class provide stateless component interface for merging records, and support various HoodieRecord " - + "types, such as Spark records or Flink records."); + public static final ConfigProperty MERGER_IMPLS = ConfigProperty + .key("hoodie.datasource.write.merger.impls") + .defaultValue(HoodieAvroRecordMerger.class.getName()) + .withDocumentation("List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. " + + "These merger impls will filter by hoodie.datasource.write.merger.strategy " + + "Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)"); + + public static final ConfigProperty MERGER_STRATEGY = ConfigProperty + .key("hoodie.datasource.write.merger.strategy") + .defaultValue(StringUtils.DEFAULT_MERGER_STRATEGY_UUID) + .withDocumentation("Id of merger strategy. Hudi will pick RecordMergers in hoodie.datasource.write.merger.impls which has the same merger strategy id"); public static final ConfigProperty KEYGENERATOR_CLASS_NAME = ConfigProperty .key("hoodie.datasource.write.keygenerator.class") @@ -505,7 +514,9 @@ public class HoodieWriteConfig extends HoodieConfig { private HoodieMetadataConfig metadataConfig; private HoodieMetastoreConfig metastoreConfig; private HoodieCommonConfig commonConfig; + private HoodieStorageConfig storageConfig; private EngineType engineType; + private HoodieRecordMerger recordMerger; /** * @deprecated Use {@link #TBL_NAME} and its methods instead @@ -882,6 +893,7 @@ protected HoodieWriteConfig() { super(); this.engineType = EngineType.SPARK; this.clientSpecifiedViewStorageConfig = null; + applyMergerClass(); } protected HoodieWriteConfig(EngineType engineType, Properties props) { @@ -889,6 +901,7 @@ protected HoodieWriteConfig(EngineType engineType, Properties props) { Properties newProps = new Properties(); newProps.putAll(props); this.engineType = engineType; + applyMergerClass(); this.consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().fromProperties(newProps).build(); this.fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().fromProperties(newProps).build(); this.clientSpecifiedViewStorageConfig = FileSystemViewStorageConfig.newBuilder().fromProperties(newProps).build(); @@ -897,6 +910,16 @@ protected HoodieWriteConfig(EngineType engineType, Properties props) { this.metadataConfig = HoodieMetadataConfig.newBuilder().fromProperties(props).build(); this.metastoreConfig = HoodieMetastoreConfig.newBuilder().fromProperties(props).build(); this.commonConfig = HoodieCommonConfig.newBuilder().fromProperties(props).build(); + this.storageConfig = HoodieStorageConfig.newBuilder().fromProperties(props).build(); + } + + private void applyMergerClass() { + List mergers = getSplitStringsOrDefault(MERGER_IMPLS).stream() + .map(String::trim) + .distinct() + .collect(Collectors.toList()); + String mergerStrategy = getString(MERGER_STRATEGY); + this.recordMerger = HoodieRecordUtils.generateRecordMerger(getString(BASE_PATH), engineType, mergers, mergerStrategy); } public static HoodieWriteConfig.Builder newBuilder() { @@ -910,6 +933,10 @@ public String getBasePath() { return getString(BASE_PATH); } + public HoodieRecordMerger getRecordMerger() { + return 
recordMerger; + } + public String getSchema() { return getString(AVRO_SCHEMA_STRING); } @@ -918,6 +945,10 @@ public void setSchema(String schemaStr) { setValue(AVRO_SCHEMA_STRING, schemaStr); } + public void setMergerClass(String mergerStrategy) { + setValue(MERGER_STRATEGY, mergerStrategy); + } + public String getInternalSchema() { return getString(INTERNAL_SCHEMA_STRING); } @@ -1339,10 +1370,6 @@ public String getPayloadClass() { return getString(HoodiePayloadConfig.PAYLOAD_CLASS_NAME); } - public String getMergeClass() { - return getString(HoodieCompactionConfig.MERGE_CLASS_NAME); - } - public int getTargetPartitionsPerDayBasedCompaction() { return getInt(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION); } @@ -1963,6 +1990,10 @@ public HoodieCommonConfig getCommonConfig() { return commonConfig; } + public HoodieStorageConfig getStorageConfig() { + return storageConfig; + } + /** * Commit call back configs. */ @@ -2262,6 +2293,16 @@ public Builder withWritePayLoad(String payload) { return this; } + public Builder withMergerImpls(String mergerImpls) { + writeConfig.setValue(MERGER_IMPLS, mergerImpls); + return this; + } + + public Builder withMergerStrategy(String mergerStrategy) { + writeConfig.setValue(MERGER_STRATEGY, mergerStrategy); + return this; + } + public Builder withKeyGenerator(String keyGeneratorClass) { writeConfig.setValue(KEYGENERATOR_CLASS_NAME, keyGeneratorClass); return this; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java index e5f8d901c853e..ad0cb61fd3481 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.HoodieTimer; @@ -31,7 +32,7 @@ import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.table.HoodieTable; import org.apache.log4j.LogManager; @@ -132,7 +133,7 @@ public static List filterKeysFromFile(Path filePath, List candid // Load all rowKeys from the file, to double-confirm if (!candidateRecordKeys.isEmpty()) { HoodieTimer timer = new HoodieTimer().startTimer(); - HoodieAvroFileReader fileReader = HoodieFileReaderFactory.getFileReader(configuration, filePath); + HoodieFileReader fileReader = HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(configuration, filePath); Set fileRowKeys = fileReader.filterRowKeys(new TreeSet<>(candidateRecordKeys)); foundRecordKeys.addAll(fileRowKeys); LOG.info(String.format("Checked keys against file %s, in %d ms. 
#candidates (%d) #found (%d)", filePath, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java index b4f252db3a11c..3d2e830c46df8 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java @@ -28,6 +28,7 @@ import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.DeleteRecord; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieDeltaWriteStat; import org.apache.hudi.common.model.HoodieLogFile; @@ -36,7 +37,6 @@ import org.apache.hudi.common.model.HoodiePayloadProps; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; import org.apache.hudi.common.model.IOType; import org.apache.hudi.common.table.log.AppendResult; @@ -217,13 +217,18 @@ private Option prepareRecord(HoodieRecord hoodieRecord) { boolean nullifyPayload = HoodieOperation.isDelete(hoodieRecord.getOperation()) && !config.allowOperationMetadataField(); recordProperties.put(HoodiePayloadProps.PAYLOAD_IS_UPDATE_RECORD_FOR_MOR, String.valueOf(isUpdateRecord)); Option finalRecord = Option.empty(); - if (!nullifyPayload && hoodieRecord.isPresent(tableSchema, recordProperties)) { + if (!nullifyPayload && !hoodieRecord.isDelete(tableSchema, recordProperties)) { if (hoodieRecord.shouldIgnore(tableSchema, recordProperties)) { return Option.of(hoodieRecord); } // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema - HoodieRecord rewrittenRecord = hoodieRecord.rewriteRecord(tableSchema, recordProperties, schemaOnReadEnabled, writeSchemaWithMetaFields); - HoodieRecord populatedRecord = populateMetadataFields(rewrittenRecord, tableSchema, recordProperties); + HoodieRecord rewrittenRecord; + if (schemaOnReadEnabled) { + rewrittenRecord = hoodieRecord.rewriteRecordWithNewSchema(tableSchema, recordProperties, writeSchemaWithMetaFields); + } else { + rewrittenRecord = hoodieRecord.rewriteRecord(tableSchema, recordProperties, writeSchemaWithMetaFields); + } + HoodieRecord populatedRecord = populateMetadataFields(rewrittenRecord, writeSchemaWithMetaFields, recordProperties); finalRecord = Option.of(populatedRecord); if (isUpdateRecord) { updatedRecordsWritten++; @@ -249,21 +254,21 @@ private Option prepareRecord(HoodieRecord hoodieRecord) { } private HoodieRecord populateMetadataFields(HoodieRecord hoodieRecord, Schema schema, Properties prop) throws IOException { - Map metadataValues = new HashMap<>(); + Map metadataValues = new HashMap<>(); String seqId = HoodieRecord.generateSequenceId(instantTime, getPartitionId(), RECORD_COUNTER.getAndIncrement()); if (config.populateMetaFields()) { - metadataValues.put(HoodieRecord.HoodieMetadataField.FILENAME_METADATA_FIELD, fileId); - metadataValues.put(HoodieRecord.HoodieMetadataField.PARTITION_PATH_METADATA_FIELD, partitionPath); - metadataValues.put(HoodieRecord.HoodieMetadataField.RECORD_KEY_METADATA_FIELD, hoodieRecord.getRecordKey()); - metadataValues.put(HoodieRecord.HoodieMetadataField.COMMIT_TIME_METADATA_FIELD, instantTime); - 
metadataValues.put(HoodieRecord.HoodieMetadataField.COMMIT_SEQNO_METADATA_FIELD, seqId); + metadataValues.put(HoodieRecord.HoodieMetadataField.FILENAME_METADATA_FIELD.getFieldName(), fileId); + metadataValues.put(HoodieRecord.HoodieMetadataField.PARTITION_PATH_METADATA_FIELD.getFieldName(), partitionPath); + metadataValues.put(HoodieRecord.HoodieMetadataField.RECORD_KEY_METADATA_FIELD.getFieldName(), hoodieRecord.getRecordKey()); + metadataValues.put(HoodieRecord.HoodieMetadataField.COMMIT_TIME_METADATA_FIELD.getFieldName(), instantTime); + metadataValues.put(HoodieRecord.HoodieMetadataField.COMMIT_SEQNO_METADATA_FIELD.getFieldName(), seqId); } if (config.allowOperationMetadataField()) { - metadataValues.put(HoodieRecord.HoodieMetadataField.OPERATION_METADATA_FIELD, hoodieRecord.getOperation().getName()); + metadataValues.put(HoodieRecord.HoodieMetadataField.OPERATION_METADATA_FIELD.getFieldName(), hoodieRecord.getOperation().getName()); } - return hoodieRecord.addMetadataValues(schema, prop, metadataValues); + return hoodieRecord.updateValues(schema, prop, metadataValues); } private void initNewStatus() { @@ -376,7 +381,7 @@ private void processAppendResult(AppendResult result, List recordL List indexedRecords = new LinkedList<>(); for (HoodieRecord hoodieRecord : recordList) { - indexedRecords.add((IndexedRecord) hoodieRecord.toIndexedRecord(tableSchema, config.getProps()).get()); + indexedRecords.add(((HoodieAvroIndexedRecord) hoodieRecord.toIndexedRecord(tableSchema, config.getProps()).get()).getData()); } Map> columnRangesMetadataMap = @@ -439,7 +444,7 @@ public boolean canWrite(HoodieRecord record) { @Override protected void doWrite(HoodieRecord record, Schema schema, TypedProperties props) { - Option> recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata(); + Option> recordMetadata = record.getMetadata(); try { init(record); flushToDiskIfRequired(record); @@ -524,7 +529,7 @@ private void writeToBuffer(HoodieRecord record) { record.seal(); } // fetch the ordering val first in case the record was deflated. - final Comparable orderingVal = record.getOrderingValue(); + final Comparable orderingVal = record.getOrderingValue(config.getProps()); Option indexedRecord = prepareRecord(record); if (indexedRecord.isPresent()) { // Skip the ignored record. diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java index c428ab0622a66..4f6b0428b26e1 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java @@ -18,7 +18,6 @@ package org.apache.hudi.io; -import org.apache.avro.Schema; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieKey; @@ -30,6 +29,7 @@ import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.hudi.table.HoodieTable; +import org.apache.avro.Schema; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -97,7 +97,7 @@ public void write(HoodieRecord oldRecord) { Schema schema = useWriterSchemaForCompaction ? 
tableSchemaWithMetaFields : tableSchema; try { // NOTE: We're enforcing preservation of the record metadata to keep existing semantic - writeToFile(new HoodieKey(key, partitionPath), oldRecord, schema, config.getProps(), true); + writeToFile(new HoodieKey(key, partitionPath), oldRecord, schema, config.getPayloadConfig().getProps(), true); } catch (IOException | RuntimeException e) { String errMsg = String.format("Failed to write old record into new file for key %s from old file %s to new file %s with writerSchema %s", key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true)); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java index 7e55b84135bd2..d6269a4fc9116 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; @@ -99,8 +100,8 @@ public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTa hoodieTable.getPartitionMetafileFormat()); partitionMetadata.trySave(getPartitionId()); createMarkerFile(partitionPath, FSUtils.makeBaseFileName(this.instantTime, this.writeToken, this.fileId, hoodieTable.getBaseFileExtension())); - this.fileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, - writeSchemaWithMetaFields, this.taskContextSupplier); + this.fileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable.getHadoopConf(), config, + writeSchemaWithMetaFields, this.taskContextSupplier, config.getRecordMerger().getRecordType()); } catch (IOException e) { throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e); } @@ -131,17 +132,22 @@ public boolean canWrite(HoodieRecord record) { protected void doWrite(HoodieRecord record, Schema schema, TypedProperties props) { Option> recordMetadata = record.getMetadata(); try { - if (!HoodieOperation.isDelete(record.getOperation()) && record.isPresent(schema, config.getProps())) { + if (!HoodieOperation.isDelete(record.getOperation()) && !record.isDelete(schema, config.getProps())) { if (record.shouldIgnore(schema, config.getProps())) { return; } // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema + HoodieRecord rewriteRecord; + if (schemaOnReadEnabled) { + rewriteRecord = record.rewriteRecordWithNewSchema(schema, config.getProps(), writeSchemaWithMetaFields); + } else { + rewriteRecord = record.rewriteRecord(schema, config.getProps(), writeSchemaWithMetaFields); + } + rewriteRecord = rewriteRecord.updateValues(writeSchemaWithMetaFields, config.getProps(), Collections.singletonMap(HoodieMetadataField.FILENAME_METADATA_FIELD.getFieldName(), path.getName())); if (preserveMetadata) { - fileWriter.write(record.getRecordKey(), record.rewriteRecordWithMetadata( - schema, config.getProps(), schemaOnReadEnabled, writeSchemaWithMetaFields, path.getName()), writeSchemaWithMetaFields); + 
fileWriter.write(record.getRecordKey(), rewriteRecord, writeSchemaWithMetaFields); } else { - fileWriter.writeWithMetadata(record.getKey(), record.rewriteRecordWithMetadata( - schema, config.getProps(), schemaOnReadEnabled, writeSchemaWithMetaFields, path.getName()), writeSchemaWithMetaFields); + fileWriter.writeWithMetadata(record.getKey(), rewriteRecord, writeSchemaWithMetaFields); } // update the new location of record, so we know where to find it next record.unseal(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java index bc1da8c217d50..b55969d6131c4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java @@ -26,7 +26,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.table.HoodieTable; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -66,7 +66,7 @@ private BloomFilter getBloomFilter() { bloomFilter = hoodieTable.getMetadataTable().getBloomFilter(partitionPathFileIDPair.getLeft(), partitionPathFileIDPair.getRight()) .orElseThrow(() -> new HoodieIndexException("BloomFilter missing for " + partitionPathFileIDPair.getRight())); } else { - try (HoodieAvroFileReader reader = createNewFileReader()) { + try (HoodieFileReader reader = createNewFileReader()) { bloomFilter = reader.readBloomFilter(); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java index fff4aa6d054c4..edbfa747f3cc4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; @@ -40,7 +41,7 @@ import org.apache.hudi.exception.HoodieCorruptedDataException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieUpsertException; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.io.storage.HoodieFileWriter; import org.apache.hudi.keygen.BaseKeyGenerator; @@ -279,7 +280,7 @@ private boolean writeUpdateRecord(HoodieRecord hoodieRecord, HoodieRecord return false; } } - return writeRecord(hoodieRecord, combineRecordOp, schema, config.getProps(), isDelete); + return writeRecord(hoodieRecord, combineRecordOp, schema, config.getPayloadConfig().getProps(), isDelete); } protected void writeInsertRecord(HoodieRecord hoodieRecord) throws IOException { @@ -288,7 +289,7 @@ protected void writeInsertRecord(HoodieRecord hoodieRecord) 
throws IOExceptio if (hoodieRecord.shouldIgnore(schema, config.getProps())) { return; } - if (writeRecord(hoodieRecord, Option.of(hoodieRecord), schema, config.getProps(), HoodieOperation.isDelete(hoodieRecord.getOperation()))) { + if (writeRecord(hoodieRecord, Option.of(hoodieRecord), schema, config.getPayloadConfig().getProps(), HoodieOperation.isDelete(hoodieRecord.getOperation()))) { insertRecordsWritten++; } } @@ -306,7 +307,7 @@ protected boolean writeRecord(HoodieRecord hoodieRecord, Option return false; } try { - if (combineRecord.isPresent() && combineRecord.get().isPresent(schema, config.getProps()) && !isDelete) { + if (combineRecord.isPresent() && !combineRecord.get().isDelete(schema, config.getProps()) && !isDelete) { writeToFile(hoodieRecord.getKey(), combineRecord.get(), schema, prop, preserveMetadata && useWriterSchemaForCompaction); recordsWritten++; } else { @@ -329,16 +330,16 @@ protected boolean writeRecord(HoodieRecord hoodieRecord, Option * Go through an old record. Here if we detect a newer version shows up, we write the new one to the file. */ public void write(HoodieRecord oldRecord) { - String key = oldRecord.getRecordKey(keyGeneratorOpt); - boolean copyOldRecord = true; Schema schema = useWriterSchemaForCompaction ? tableSchemaWithMetaFields : tableSchema; + boolean copyOldRecord = true; + String key = oldRecord.getRecordKey(keyGeneratorOpt); TypedProperties props = config.getPayloadConfig().getProps(); if (keyToNewRecords.containsKey(key)) { // If we have duplicate records that we are updating, then the hoodie record will be deflated after // writing the first record. So make a copy of the record to be merged HoodieRecord hoodieRecord = keyToNewRecords.get(key).newInstance(); try { - Option combinedRecord = merge.combineAndGetUpdateValue(oldRecord, hoodieRecord, schema, props); + Option combinedRecord = recordMerger.merge(oldRecord, hoodieRecord, schema, props); if (combinedRecord.isPresent() && combinedRecord.get().shouldIgnore(schema, props)) { // If it is an IGNORE_RECORD, just copy the old record, and do not update the new record. 
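
For orientation, a minimal sketch (not part of the patch) of the merge path the hunk above now follows: the handle asks the write config for its HoodieRecordMerger and delegates to merge(oldRecord, newRecord, schema, props) instead of calling combineAndGetUpdateValue on a HoodieMerge instance. The helper class and method below are hypothetical, and the merge(...) signature is inferred from the call sites in this diff, so the actual interface in hudi-common may differ.

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordMerger;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;

// Hypothetical helper mirroring the HoodieMergeHandle.write(oldRecord) flow shown above.
final class MergePathSketch {

  // Decides what should be written for a key that already exists in the base file:
  // the merged record, the untouched old record (IGNORE_RECORD), or nothing (delete).
  static Option<HoodieRecord> resolve(HoodieWriteConfig config, HoodieRecord oldRecord,
                                      HoodieRecord newRecord, Schema schema) throws IOException {
    TypedProperties props = config.getPayloadConfig().getProps(); // payload props, as used by writeUpdateRecord above
    HoodieRecordMerger merger = config.getRecordMerger();         // engine-specific merger resolved from MERGER_IMPLS
    Option<HoodieRecord> combined = merger.merge(oldRecord, newRecord, schema, props);
    if (!combined.isPresent()) {
      return Option.empty();                                      // merger produced nothing to write
    }
    HoodieRecord merged = combined.get();
    if (merged.shouldIgnore(schema, props)) {
      return Option.of(oldRecord);                                // IGNORE_RECORD: copy the old record as-is
    }
    // Deletes are now detected via isDelete(...) rather than the old isPresent(...) check.
    return merged.isDelete(schema, props) ? Option.empty() : Option.of(merged);
  }
}
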
@@ -374,12 +375,19 @@ public void write(HoodieRecord oldRecord) { } protected void writeToFile(HoodieKey key, HoodieRecord record, Schema schema, Properties prop, boolean shouldPreserveRecordMetadata) throws IOException { + HoodieRecord rewriteRecord; + if (schemaOnReadEnabled) { + rewriteRecord = record.rewriteRecordWithNewSchema(schema, prop, writeSchemaWithMetaFields); + } else { + rewriteRecord = record.rewriteRecord(schema, prop, writeSchemaWithMetaFields); + } + rewriteRecord = rewriteRecord.updateValues(writeSchemaWithMetaFields, prop, Collections.singletonMap(HoodieMetadataField.FILENAME_METADATA_FIELD.getFieldName(), newFilePath.getName())); if (shouldPreserveRecordMetadata) { // NOTE: `FILENAME_METADATA_FIELD` has to be rewritten to correctly point to the // file holding this record even in cases when overall metadata is preserved - fileWriter.write(key.getRecordKey(), record.rewriteRecordWithMetadata(schema, prop, schemaOnReadEnabled, writeSchemaWithMetaFields, newFilePath.getName()), writeSchemaWithMetaFields); + fileWriter.write(key.getRecordKey(), rewriteRecord, writeSchemaWithMetaFields); } else { - fileWriter.writeWithMetadata(key, record.rewriteRecord(schema, prop, schemaOnReadEnabled, writeSchemaWithMetaFields), writeSchemaWithMetaFields); + fileWriter.writeWithMetadata(key, rewriteRecord, writeSchemaWithMetaFields); } } @@ -444,7 +452,7 @@ public void performMergeDataValidationCheck(WriteStatus writeStatus) { long oldNumWrites = 0; try { - HoodieAvroFileReader reader = HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(), oldFilePath); + HoodieFileReader reader = HoodieFileReaderFactory.getReaderFactory(this.config.getRecordMerger().getRecordType()).getFileReader(hoodieTable.getHadoopConf(), oldFilePath); oldNumWrites = reader.getTotalRecords(); } catch (IOException e) { throw new HoodieUpsertException("Failed to check for merge data validation", e); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java index 753d1db3010fb..98092fd1e87ab 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java @@ -20,7 +20,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.table.HoodieTable; import java.io.IOException; @@ -36,7 +36,7 @@ public HoodieRangeInfoHandle(HoodieWriteConfig config, HoodieTable h } public String[] getMinMaxKeys() throws IOException { - try (HoodieAvroFileReader reader = createNewFileReader()) { + try (HoodieFileReader reader = createNewFileReader()) { return reader.readMinMaxRecordKeys(); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java index b699e9700c1f6..5f4c10bf965ab 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java @@ -24,7 +24,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; -import 
org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.table.HoodieTable; @@ -61,8 +61,8 @@ protected HoodieBaseFile getLatestDataFile() { .getLatestBaseFile(partitionPathFileIDPair.getLeft(), partitionPathFileIDPair.getRight()).get(); } - protected HoodieAvroFileReader createNewFileReader() throws IOException { - return HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(), + protected HoodieFileReader createNewFileReader() throws IOException { + return HoodieFileReaderFactory.getReaderFactory(this.config.getRecordMerger().getRecordType()).getFileReader(hoodieTable.getHadoopConf(), new Path(getLatestDataFile().getPath())); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java index 9ee6e0884dc04..24d0b0bc07755 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java @@ -27,9 +27,8 @@ import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; @@ -61,7 +60,7 @@ public abstract class HoodieWriteHandle extends HoodieIOHandle hoodieTable, HoodieWriteConfig config, Schema schema, TaskContextSupplier taskContextSupplier) throws IOException { - return HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, schema, taskContextSupplier); + return HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable.getHadoopConf(), config, schema, taskContextSupplier, config.getRecordMerger().getRecordType()); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java deleted file mode 100644 index 5083ccf6c092c..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.io.storage; - -import org.apache.hudi.avro.HoodieAvroWriteSupport; -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.BloomFilterFactory; -import org.apache.hudi.common.engine.TaskContextSupplier; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.parquet.avro.AvroSchemaConverter; - -import java.io.IOException; - -import static org.apache.hudi.common.model.HoodieFileFormat.HFILE; -import static org.apache.hudi.common.model.HoodieFileFormat.ORC; -import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; -import static org.apache.hudi.io.storage.HoodieHFileConfig.CACHE_DATA_IN_L1; -import static org.apache.hudi.io.storage.HoodieHFileConfig.DROP_BEHIND_CACHE_COMPACTION; -import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR; -import static org.apache.hudi.io.storage.HoodieHFileConfig.PREFETCH_ON_OPEN; - -public class HoodieFileWriterFactory { - - public static HoodieFileWriter getFileWriter( - String instantTime, Path path, HoodieTable hoodieTable, HoodieWriteConfig config, Schema schema, - TaskContextSupplier taskContextSupplier) throws IOException { - final String extension = FSUtils.getFileExtension(path.getName()); - if (PARQUET.getFileExtension().equals(extension)) { - return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, config.populateMetaFields()); - } - if (HFILE.getFileExtension().equals(extension)) { - return newHFileFileWriter( - instantTime, path, config, schema, hoodieTable.getHadoopConf(), taskContextSupplier); - } - if (ORC.getFileExtension().equals(extension)) { - return newOrcFileWriter( - instantTime, path, config, schema, hoodieTable.getHadoopConf(), taskContextSupplier); - } - throw new UnsupportedOperationException(extension + " format not supported yet."); - } - - private static HoodieAvroFileWriter newParquetFileWriter( - String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable, - TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException { - return newParquetFileWriter(instantTime, path, config, schema, hoodieTable.getHadoopConf(), - taskContextSupplier, populateMetaFields, populateMetaFields); - } - - private static HoodieAvroFileWriter newParquetFileWriter( - String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf, - TaskContextSupplier taskContextSupplier, boolean populateMetaFields, boolean enableBloomFilter) throws IOException { - Option filter = enableBloomFilter ? 
Option.of(createBloomFilter(config)) : Option.empty(); - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter); - - HoodieParquetConfig parquetConfig = new HoodieParquetConfig<>(writeSupport, config.getParquetCompressionCodec(), - config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(), - conf, config.getParquetCompressionRatio(), config.parquetDictionaryEnabled()); - - return new HoodieAvroParquetWriter(path, parquetConfig, instantTime, taskContextSupplier, populateMetaFields); - } - - static HoodieAvroFileWriter newHFileFileWriter( - String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf, - TaskContextSupplier taskContextSupplier) throws IOException { - - BloomFilter filter = createBloomFilter(config); - HoodieHFileConfig hfileConfig = new HoodieHFileConfig(conf, - config.getHFileCompressionAlgorithm(), config.getHFileBlockSize(), config.getHFileMaxFileSize(), - HoodieAvroHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, - filter, HFILE_COMPARATOR); - - return new HoodieAvroHFileWriter(instantTime, path, hfileConfig, schema, taskContextSupplier, config.populateMetaFields()); - } - - private static HoodieAvroFileWriter newOrcFileWriter( - String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf, - TaskContextSupplier taskContextSupplier) throws IOException { - BloomFilter filter = createBloomFilter(config); - HoodieOrcConfig orcConfig = new HoodieOrcConfig(conf, config.getOrcCompressionCodec(), - config.getOrcStripeSize(), config.getOrcBlockSize(), config.getOrcMaxFileSize(), filter); - return new HoodieAvroOrcWriter(instantTime, path, orcConfig, schema, taskContextSupplier); - } - - private static BloomFilter createBloomFilter(HoodieWriteConfig config) { - return BloomFilterFactory.createBloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP(), - config.getDynamicBloomFilterMaxNumEntries(), - config.getBloomFilterType()); - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringExecutionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringExecutionStrategy.java index a4b09d006ee21..d2de06883835e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringExecutionStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringExecutionStrategy.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; @@ -39,11 +40,13 @@ public abstract class ClusteringExecutionStrategy implements Seriali private final HoodieTable hoodieTable; private final transient HoodieEngineContext engineContext; private final HoodieWriteConfig writeConfig; + protected final HoodieRecordType recordType; public ClusteringExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { this.writeConfig = writeConfig; this.hoodieTable = table; this.engineContext = engineContext; + this.recordType = 
table.getConfig().getRecordMerger().getRecordType(); } /** diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java index 393ee9ddb698c..a3a3188df9a0a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java @@ -19,31 +19,22 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.utils.MergingIterator; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.table.HoodieTable; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.io.BinaryDecoder; -import org.apache.avro.io.BinaryEncoder; -import org.apache.avro.io.DecoderFactory; -import org.apache.avro.io.EncoderFactory; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import javax.annotation.Nonnull; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Iterator; @@ -60,32 +51,6 @@ public abstract class BaseMergeHelper { */ public abstract void runMerge(HoodieTable table, HoodieMergeHandle upsertHandle) throws IOException; - protected HoodieRecord transformRecordBasedOnNewSchema(GenericDatumReader gReader, GenericDatumWriter gWriter, - ThreadLocal encoderCache, ThreadLocal decoderCache, - GenericRecord gRec) { - ByteArrayOutputStream inStream = null; - try { - inStream = new ByteArrayOutputStream(); - BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(inStream, encoderCache.get()); - encoderCache.set(encoder); - gWriter.write(gRec, encoder); - encoder.flush(); - - BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inStream.toByteArray(), decoderCache.get()); - decoderCache.set(decoder); - GenericRecord transformedRec = gReader.read(null, decoder); - return new HoodieAvroIndexedRecord(transformedRec); - } catch (IOException e) { - throw new HoodieException(e); - } finally { - try { - inStream.close(); - } catch (IOException ioe) { - throw new HoodieException(ioe.getMessage(), ioe); - } - } - } - /** * Create Parquet record iterator that provides a stitched view of record read from skeleton and bootstrap file. 
* Skeleton file is a representation of the bootstrap file inside the table, with just the bare bone fields needed @@ -99,7 +64,8 @@ protected Iterator getMergingIterator(HoodieTable tabl boolean externalSchemaTransformation) throws IOException { Path externalFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath()); Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf()); - HoodieFileReader bootstrapReader = HoodieFileReaderFactory.getFileReader(bootstrapFileConfig, externalFilePath); + HoodieRecordType recordType = table.getConfig().getRecordMerger().getRecordType(); + HoodieFileReader bootstrapReader = HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(bootstrapFileConfig, externalFilePath); Schema bootstrapReadSchema; if (externalSchemaTransformation) { @@ -109,15 +75,15 @@ protected Iterator getMergingIterator(HoodieTable tabl } return new MergingIterator<>( - reader.getRecordIterator(readerSchema, HoodieAvroIndexedRecord::new), - bootstrapReader.getRecordIterator(bootstrapReadSchema, HoodieAvroIndexedRecord::new), - (oneRecord, otherRecord) -> mergeRecords(oneRecord, otherRecord, readerSchema, mergeHandle.getWriterSchemaWithMetaFields())); + (Iterator) reader.getRecordIterator(readerSchema), + (Iterator) bootstrapReader.getRecordIterator(bootstrapReadSchema), + (oneRecord, otherRecord) -> mergeRecords(oneRecord, otherRecord, mergeHandle.getWriterSchemaWithMetaFields())); } @Nonnull - private static HoodieRecord mergeRecords(HoodieRecord one, HoodieRecord other, Schema readerSchema, Schema writerSchema) { + private static HoodieRecord mergeRecords(HoodieRecord left, HoodieRecord right, Schema targetSchema) { try { - return one.mergeWith(other, readerSchema, writerSchema); + return left.mergeWith(right, targetSchema); } catch (IOException e) { throw new HoodieIOException("Failed to merge records", e); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java index 1efe3d9641c54..9133358b7f2b1 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java @@ -19,9 +19,8 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; @@ -30,6 +29,7 @@ import java.time.Duration; import java.time.Instant; +import java.util.Properties; public abstract class BaseWriteHelper { @@ -83,10 +83,10 @@ public I combineOnCondition( */ public I deduplicateRecords( I records, HoodieTable table, int parallelism) { - HoodieMerge merge = HoodieRecordUtils.loadMerge(table.getConfig().getMergeClass()); - return deduplicateRecords(records, table.getIndex(), parallelism, merge); + HoodieRecordMerger recordMerger = table.getConfig().getRecordMerger(); + return deduplicateRecords(records, table.getIndex(), parallelism, recordMerger, table.getConfig().getProps()); } public abstract I deduplicateRecords( - I records, HoodieIndex index, int parallelism, 
HoodieMerge merge); + I records, HoodieIndex index, int parallelism, HoodieRecordMerger recordMerger, Properties props); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieDeleteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieDeleteHelper.java index f54184abb0169..0d212555ab228 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieDeleteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieDeleteHelper.java @@ -24,8 +24,10 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieEmptyRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieUpsertException; @@ -84,8 +86,15 @@ public HoodieWriteMetadata> execute(String instantTime, dedupedKeys = keys.repartition(parallelism); } - HoodieData> dedupedRecords = - dedupedKeys.map(key -> new HoodieAvroRecord(key, new EmptyHoodieRecordPayload())); + HoodieData dedupedRecords; + HoodieRecordType recordType = config.getRecordMerger().getRecordType(); + if (recordType == HoodieRecordType.AVRO) { + // For BWC, will remove when HoodieRecordPayload removed + dedupedRecords = + dedupedKeys.map(key -> new HoodieAvroRecord(key, new EmptyHoodieRecordPayload())); + } else { + dedupedRecords = dedupedKeys.map(key -> new HoodieEmptyRecord<>(key, recordType)); + } Instant beginTag = Instant.now(); // perform index loop up to get existing location of records HoodieData> taggedRecords = table.getIndex().tagLocation(dedupedRecords, context, table); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java index 8f3eb67e9d0d8..49edb981a5d0b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java @@ -24,7 +24,6 @@ import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.ClosableIterator; @@ -44,11 +43,7 @@ import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.BinaryDecoder; -import org.apache.avro.io.BinaryEncoder; import org.apache.hadoop.conf.Configuration; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -84,22 +79,25 @@ public void runMerge(HoodieTable>, HoodieData gWriter; - final GenericDatumReader gReader; + // Support schema evolution Schema readSchema; + // These two schema used to replace gWriter and gReader. 
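For readers tracing the HoodieDeleteHelper hunk above: delete records are now built according to the configured record type instead of always going through an Avro payload. Below is a minimal standalone sketch of that branching; the wrapper class and method name are hypothetical, only the HoodieEmptyRecord and HoodieAvroRecord constructions are taken from the patch.

import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieEmptyRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;

// Hypothetical helper, not part of the patch; it only isolates the branching used in HoodieDeleteHelper.
final class DeleteRecordFactory {
  private DeleteRecordFactory() {
  }

  @SuppressWarnings({"unchecked", "rawtypes"})
  static HoodieRecord createDeleteRecord(HoodieKey key, HoodieRecordType recordType) {
    if (recordType == HoodieRecordType.AVRO) {
      // Kept for backwards compatibility until HoodieRecordPayload is removed.
      return new HoodieAvroRecord(key, new EmptyHoodieRecordPayload());
    }
    // Engine-native record types get a payload-free empty (delete) record.
    return new HoodieEmptyRecord<>(key, recordType);
  }
}

Keeping the Avro branch preserves behaviour for existing HoodieRecordPayload-based tables until that abstraction is retired.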
+ // In previous logic, avro record is serialized by gWriter and then is deserialized by gReader. + // Now we converge this logic in record#rewrite. + Schema readerSchema; + Schema writerSchema; + HoodieFileReader reader = HoodieFileReaderFactory.getReaderFactory(table.getConfig().getRecordMerger().getRecordType()).getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath()); if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) { - readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema(); - gWriter = new GenericDatumWriter<>(readSchema); - gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields()); + readSchema = reader.getSchema(); + writerSchema = readSchema; + readerSchema = mergeHandle.getWriterSchemaWithMetaFields(); } else { - gReader = null; - gWriter = null; + readerSchema = null; + writerSchema = null; readSchema = mergeHandle.getWriterSchemaWithMetaFields(); } BoundedInMemoryExecutor wrapper = null; - HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath()); - Option querySchemaOpt = SerDeHelper.fromJson(table.getConfig().getInternalSchema()); boolean needToReWriteRecord = false; Map renameCols = new HashMap<>(); @@ -135,21 +133,22 @@ public void runMerge(HoodieTable>, HoodieData encoderCache = new ThreadLocal<>(); - ThreadLocal decoderCache = new ThreadLocal<>(); wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), readerIterator, new UpdateHandler(mergeHandle), record -> { if (!externalSchemaTransformation) { return record; } - // TODO Other type of record need to change - return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) ((HoodieRecord)record).getData()); + try { + return ((HoodieRecord) record).rewriteRecord(writerSchema, new Properties(), readerSchema); + } catch (IOException e) { + throw new HoodieException(String.format("Failed to rewrite record. 
WriterSchema: %s; ReaderSchema: %s", writerSchema, readerSchema), e); + } }, table.getPreExecuteRunnable()); wrapper.execute(); } catch (Exception e) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieWriteHelper.java index 57bb511c634ee..ad664606c78ff 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieWriteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieWriteHelper.java @@ -23,11 +23,15 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; +import java.io.IOException; +import java.util.Properties; + public class HoodieWriteHelper extends BaseWriteHelper>, HoodieData, HoodieData, R> { private HoodieWriteHelper() { @@ -49,7 +53,7 @@ protected HoodieData> tag(HoodieData> dedupedRec @Override public HoodieData> deduplicateRecords( - HoodieData> records, HoodieIndex index, int parallelism, HoodieMerge merge) { + HoodieData> records, HoodieIndex index, int parallelism, HoodieRecordMerger recordMerger, Properties props) { boolean isIndexingGlobal = index.isGlobal(); return records.mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); @@ -57,8 +61,13 @@ public HoodieData> deduplicateRecords( Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; return Pair.of(key, record); }).reduceByKey((rec1, rec2) -> { - @SuppressWarnings("unchecked") - HoodieRecord reducedRecord = merge.preCombine(rec1, rec2); + HoodieRecord reducedRecord; + try { + // Precombine do not need schema and do not return null + reducedRecord = recordMerger.merge(rec1, rec2, null, props).get(); + } catch (IOException e) { + throw new HoodieException(String.format("Error to merge two records, %s, %s", rec1, rec2), e); + } HoodieKey reducedKey = rec1.getData().equals(reducedRecord.getData()) ? 
rec1.getKey() : rec2.getKey(); return reducedRecord.newInstance(reducedKey); }, parallelism).map(Pair::getRight); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java index 69c6b48feaf92..23eab31ab30bc 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java @@ -194,6 +194,7 @@ public List compact(HoodieCompactionHandler compactionHandler, .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) .withOperationField(config.allowOperationMetadataField()) .withPartition(operation.getPartitionPath()) + .withRecordMerger(config.getRecordMerger()) .build(); Option oldDataFileOpt = diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java index 89e3a7738aa45..5fb7065a8a89c 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java @@ -33,7 +33,6 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; import org.apache.hudi.common.model.HoodieAvroRecord; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.FileIOUtils; @@ -105,8 +104,8 @@ protected HoodieAvroHFileWriter createWriter( when(mockTaskContextSupplier.getPartitionIdSupplier()).thenReturn(partitionSupplier); when(partitionSupplier.get()).thenReturn(10); - return (HoodieAvroHFileWriter)HoodieFileWriterFactory.newHFileFileWriter( - instantTime, getFilePath(), writeConfig, avroSchema, conf, mockTaskContextSupplier); + return (HoodieAvroHFileWriter)HoodieFileWriterFactory.getFileWriter( + instantTime, getFilePath(), conf, writeConfig.getStorageConfig(), avroSchema, mockTaskContextSupplier, writeConfig.getRecordMerger().getRecordType()); } @Override @@ -229,7 +228,7 @@ public void testReaderGetRecordIterator() throws Exception { IntStream.concat(IntStream.range(40, NUM_RECORDS * 2), IntStream.range(10, 20)) .mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toList()); Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc"); - Iterator iterator = hfileReader.getRecordsByKeysIterator(keys, avroSchema); + Iterator iterator = hfileReader.getIndexedRecordsByKeysIterator(keys, avroSchema); List expectedIds = IntStream.concat(IntStream.range(40, NUM_RECORDS), IntStream.range(10, 20)) @@ -255,18 +254,19 @@ public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception { List keyPrefixes = Collections.singletonList("key"); Iterator iterator = - hfileReader.getRecordsByKeyPrefixIterator(keyPrefixes, avroSchema); + hfileReader.getIndexedRecordsByKeyPrefixIterator(keyPrefixes, avroSchema); List recordsByPrefix = toStream(iterator).map(r -> (GenericRecord)r).collect(Collectors.toList()); - List allRecords = toStream(hfileReader.getRecordIterator(HoodieAvroIndexedRecord::new)).map(r -> 
(GenericRecord)r.getData()).collect(Collectors.toList()); + List allRecords = toStream(hfileReader.getRecordIterator()) + .map(r -> (GenericRecord) r.getData()).collect(Collectors.toList()); assertEquals(allRecords, recordsByPrefix); // filter for "key1" : entries from key10 to key19 should be matched List expectedKey1s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key1")).collect(Collectors.toList()); iterator = - hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key1"), avroSchema); + hfileReader.getIndexedRecordsByKeyPrefixIterator(Collections.singletonList("key1"), avroSchema); recordsByPrefix = StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) .map(r -> (GenericRecord)r) @@ -276,7 +276,7 @@ public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception { // exact match List expectedKey25 = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key25")).collect(Collectors.toList()); iterator = - hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key25"), avroSchema); + hfileReader.getIndexedRecordsByKeyPrefixIterator(Collections.singletonList("key25"), avroSchema); recordsByPrefix = StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) .map(r -> (GenericRecord)r) @@ -285,7 +285,7 @@ public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception { // no match. key prefix is beyond entries in file. iterator = - hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key99"), avroSchema); + hfileReader.getIndexedRecordsByKeyPrefixIterator(Collections.singletonList("key99"), avroSchema); recordsByPrefix = StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) .map(r -> (GenericRecord)r) @@ -294,7 +294,7 @@ public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception { // no match. but keyPrefix is in between the entries found in file. 
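As a usage-level illustration of the renamed lookup API exercised by this test, here is a sketch that collects row keys for a given key prefix. It assumes the reader is a HoodieAvroHFileReader and that the iterator yields Avro records, as the assertions above imply; the wrapper class and method are hypothetical.

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.io.storage.HoodieAvroHFileReader;

// Hypothetical helper: collect the "_row_key" values of all records whose HFile key starts with the prefix.
final class HFilePrefixLookupSketch {
  private HFilePrefixLookupSketch() {
  }

  static List<String> rowKeysForPrefix(HoodieAvroHFileReader hfileReader, Schema avroSchema, String prefix) throws IOException {
    List<String> rowKeys = new ArrayList<>();
    Iterator<?> iterator =
        hfileReader.getIndexedRecordsByKeyPrefixIterator(Collections.singletonList(prefix), avroSchema);
    iterator.forEachRemaining(r -> rowKeys.add(((GenericRecord) r).get("_row_key").toString()));
    return rowKeys;
  }
}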
iterator = - hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key1234"), avroSchema); + hfileReader.getIndexedRecordsByKeyPrefixIterator(Collections.singletonList("key1234"), avroSchema); recordsByPrefix = StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) .map(r -> (GenericRecord)r) @@ -305,7 +305,7 @@ public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception { List expectedKey50and1s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key1") || (entry.get("_row_key").toString()).contains("key50")).collect(Collectors.toList()); iterator = - hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key50", "key1"), avroSchema); + hfileReader.getIndexedRecordsByKeyPrefixIterator(Arrays.asList("key50", "key1"), avroSchema); recordsByPrefix = StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) .map(r -> (GenericRecord)r) @@ -316,7 +316,7 @@ public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception { List expectedKey50and0s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key0") || (entry.get("_row_key").toString()).contains("key50")).collect(Collectors.toList()); iterator = - hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key50", "key0"), avroSchema); + hfileReader.getIndexedRecordsByKeyPrefixIterator(Arrays.asList("key50", "key0"), avroSchema); recordsByPrefix = StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) .map(r -> (GenericRecord)r) @@ -328,7 +328,7 @@ public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception { .filter(entry -> (entry.get("_row_key").toString()).contains("key1") || (entry.get("_row_key").toString()).contains("key0")) .collect(Collectors.toList()); iterator = - hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key1", "key0"), avroSchema); + hfileReader.getIndexedRecordsByKeyPrefixIterator(Arrays.asList("key1", "key0"), avroSchema); recordsByPrefix = StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) .map(r -> (GenericRecord)r) diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java index 10e0ce225689c..1bd2a0cf93f43 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java @@ -22,7 +22,8 @@ import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.bloom.BloomFilterTypeCode; import org.apache.hudi.common.engine.TaskContextSupplier; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; @@ -70,7 +71,7 @@ protected HoodieAvroOrcWriter createWriter( @Override protected HoodieAvroFileReader createReader( Configuration conf) throws Exception { - return HoodieFileReaderFactory.getFileReader(conf, getFilePath()); + return (HoodieAvroFileReader) HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(conf, getFilePath()); } @Override diff --git 
a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java index cf701bc017ed1..fde1315a34d5c 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java @@ -113,7 +113,7 @@ public void testWriteReadPrimitiveRecord() throws Exception { Configuration conf = new Configuration(); verifyMetadata(conf); verifySchema(conf, schemaPath); - verifySimpleRecords(new TransformIterator(createReader(conf).getRecordIterator(HoodieAvroIndexedRecord::new))); + verifySimpleRecords(createReader(conf).getRecordIterator()); } @Test @@ -140,7 +140,7 @@ public void testWriteReadComplexRecord() throws Exception { Configuration conf = new Configuration(); verifyMetadata(conf); verifySchema(conf, schemaPath); - verifyComplexRecords(new TransformIterator(createReader(conf).getRecordIterator(HoodieAvroIndexedRecord::new))); + verifyComplexRecords(createReader(conf).getRecordIterator()); } @Test @@ -197,10 +197,10 @@ protected void writeFileWithSchemaWithMeta() throws Exception { writer.close(); } - protected void verifySimpleRecords(Iterator iterator) { + protected void verifySimpleRecords(Iterator> iterator) { int index = 0; while (iterator.hasNext()) { - GenericRecord record = (GenericRecord) iterator.next(); + GenericRecord record = (GenericRecord) iterator.next().getData(); String key = "key" + String.format("%02d", index); assertEquals(key, record.get("_row_key").toString()); assertEquals(Integer.toString(index), record.get("time").toString()); @@ -209,10 +209,10 @@ protected void verifySimpleRecords(Iterator iterator) { } } - protected void verifyComplexRecords(Iterator iterator) { + protected void verifyComplexRecords(Iterator> iterator) { int index = 0; while (iterator.hasNext()) { - GenericRecord record = (GenericRecord) iterator.next(); + GenericRecord record = (GenericRecord) iterator.next().getData(); String key = "key" + String.format("%02d", index); assertEquals(key, record.get("_row_key").toString()); assertEquals(Integer.toString(index), record.get("time").toString()); @@ -247,10 +247,10 @@ private void verifyFilterRowKeys(HoodieAvroFileReader hoodieReader) { private void verifyReaderWithSchema(String schemaPath, HoodieAvroFileReader hoodieReader) throws IOException { Schema evolvedSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, schemaPath); - Iterator iter = hoodieReader.getRecordIterator(evolvedSchema); + Iterator> iter = hoodieReader.getRecordIterator(evolvedSchema); int index = 0; while (iter.hasNext()) { - verifyRecord(schemaPath, (GenericRecord) iter.next(), index); + verifyRecord(schemaPath, (GenericRecord) iter.next().getData(), index); index++; } } @@ -268,24 +268,4 @@ private void verifyRecord(String schemaPath, GenericRecord record, int index) { } assertNull(record.get("added_field")); } - - class TransformIterator implements Iterator { - - private final Iterator iter; - - public TransformIterator(Iterator iter) { - this.iter = iter; - } - - @Override - public boolean hasNext() { - return iter.hasNext(); - } - - @Override - public IndexedRecord next() { - return (GenericRecord) iter.next().getData(); - - } - } } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java 
b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java index a9525760261cd..3ab7ccc9fb744 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java @@ -44,7 +44,7 @@ import org.apache.hudi.common.testutils.HoodieMetadataTestTable; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.io.storage.HoodieAvroOrcWriter; import org.apache.hudi.io.storage.HoodieAvroParquetWriter; import org.apache.hudi.io.storage.HoodieOrcConfig; diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java index 7d778008ec2e8..a15a438c7a653 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java @@ -20,7 +20,6 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; @@ -28,21 +27,18 @@ import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.io.HoodieMergeHandle; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.BinaryDecoder; -import org.apache.avro.io.BinaryEncoder; import org.apache.hadoop.conf.Configuration; import java.io.IOException; import java.util.Iterator; import java.util.List; +import java.util.Properties; /** * Flink merge helper. @@ -64,42 +60,47 @@ public static FlinkMergeHelper newInstance() { @Override public void runMerge(HoodieTable>, List, List> table, HoodieMergeHandle>, List, List> mergeHandle) throws IOException { - final GenericDatumWriter gWriter; - final GenericDatumReader gReader; + // Support schema evolution Schema readSchema; + // These two schema used to replace gWriter and gReader. + // In previous logic, avro record is serialized by gWriter and then is deserialized by gReader. + // Now we converge this logic in record#rewrite. 
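In other words, the per-record transformation applied by the bounded in-memory executor collapses into a single rewrite call. A condensed sketch, assuming the rewriteRecord(writerSchema, props, readerSchema) shape used throughout this patch; the wrapper class is hypothetical.

import java.io.IOException;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieException;

// Hypothetical wrapper around the rewrite step applied during merges.
final class SchemaRewriteSketch {
  private SchemaRewriteSketch() {
  }

  // Re-shapes a record that was read with writerSchema into the layout expected by readerSchema,
  // replacing the former GenericDatumWriter/GenericDatumReader serialize-then-deserialize round trip.
  @SuppressWarnings("rawtypes")
  static HoodieRecord rewriteForNewSchema(HoodieRecord record, Schema writerSchema, Schema readerSchema) {
    try {
      return record.rewriteRecord(writerSchema, new Properties(), readerSchema);
    } catch (IOException e) {
      throw new HoodieException(
          String.format("Failed to rewrite record. WriterSchema: %s; ReaderSchema: %s", writerSchema, readerSchema), e);
    }
  }
}

Folding the reconciliation into the record abstraction lets engine-specific record implementations supply their own rewrite logic instead of an Avro round trip.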
+ Schema readerSchema; + Schema writerSchema; final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation(); HoodieBaseFile baseFile = mergeHandle.baseFileForMerge(); + Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf()); + HoodieFileReader reader = HoodieFileReaderFactory.getReaderFactory(table.getConfig().getRecordMerger().getRecordType()).getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath()); if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) { - readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema(); - gWriter = new GenericDatumWriter<>(readSchema); - gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields()); + readSchema = reader.getSchema(); + writerSchema = readSchema; + readerSchema = mergeHandle.getWriterSchemaWithMetaFields(); } else { - gReader = null; - gWriter = null; + readerSchema = null; + writerSchema = null; readSchema = mergeHandle.getWriterSchemaWithMetaFields(); } BoundedInMemoryExecutor wrapper = null; - Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf()); - HoodieAvroFileReader reader = HoodieFileReaderFactory.getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath()); try { final Iterator readerIterator; if (baseFile.getBootstrapBaseFile().isPresent()) { readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation); } else { - readerIterator = reader.getRecordIterator(readSchema, HoodieAvroIndexedRecord::new); + readerIterator = reader.getRecordIterator(readSchema); } - ThreadLocal encoderCache = new ThreadLocal<>(); - ThreadLocal decoderCache = new ThreadLocal<>(); wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> { if (!externalSchemaTransformation) { return record; } - // TODO Other type of record need to change - return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) ((HoodieRecord)record).getData()); + try { + return ((HoodieRecord) record).rewriteRecord(writerSchema, new Properties(), readerSchema); + } catch (IOException e) { + throw new HoodieException(e); + } }); wrapper.execute(); } catch (Exception e) { diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java index ce6107714dd63..3fde3a54e8a4a 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java @@ -24,18 +24,21 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; +import java.io.IOException; import java.time.Duration; import 
java.time.Instant; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Properties; import java.util.stream.Collectors; /** @@ -87,21 +90,26 @@ protected List> tag(List> dedupedRecords, Hoodie @Override public List> deduplicateRecords( - List> records, HoodieIndex index, int parallelism, HoodieMerge merge) { + List> records, HoodieIndex index, int parallelism, HoodieRecordMerger recordMerger, Properties props) { // If index used is global, then records are expected to differ in their partitionPath Map>> keyedRecords = records.stream() .collect(Collectors.groupingBy(record -> record.getKey().getRecordKey())); return keyedRecords.values().stream().map(x -> x.stream().reduce((rec1, rec2) -> { - @SuppressWarnings("unchecked") - final HoodieRecord reducedRec = merge.preCombine(rec1, rec2); + HoodieRecord reducedRecord; + try { + // Precombine do not need schema and do not return null + reducedRecord = recordMerger.merge(rec1, rec2, null, props).get(); + } catch (IOException e) { + throw new HoodieException(String.format("Error to merge two records, %s, %s", rec1, rec2), e); + } // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. - boolean choosePrev = rec1 == reducedRec; + boolean choosePrev = rec1 == reducedRecord; HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey(); HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation(); - HoodieRecord hoodieRecord = reducedRec.newInstance(reducedKey, operation); + HoodieRecord hoodieRecord = reducedRecord.newInstance(reducedKey, operation); // reuse the location from the first record. hoodieRecord.setCurrentLocation(rec1.getCurrentLocation()); return hoodieRecord; diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java index 7db20bdf9b7a6..fe398238da562 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java @@ -20,8 +20,6 @@ package org.apache.hudi.client.clustering.run.strategy; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieClusteringGroup; @@ -32,12 +30,9 @@ import org.apache.hudi.common.data.HoodieListData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.ClusteringOperation; -import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.RewriteAvroPayload; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.util.Option; @@ -48,10 +43,8 @@ import org.apache.hudi.execution.bulkinsert.JavaBulkInsertInternalPartitionerFactory; import org.apache.hudi.execution.bulkinsert.JavaCustomColumnsSortPartitioner; import org.apache.hudi.io.IOUtils; 
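Across the Flink, Java and Spark write helpers, precombine is now delegated to the configured HoodieRecordMerger rather than HoodieMerge#preCombine. A condensed, engine-agnostic sketch of that reduction, assuming the merge(older, newer, schema, props) call shape and Option return used in these helpers; the wrapper class is hypothetical.

import java.io.IOException;
import java.util.Properties;

import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordMerger;
import org.apache.hudi.exception.HoodieException;

// Hypothetical helper mirroring the reduce step in the deduplication paths.
final class PrecombineSketch {
  private PrecombineSketch() {
  }

  // Reduces two records that share a deduplication key down to one. Precombine passes no schema and is
  // expected to always yield a record; the survivor keeps the key of whichever input's data "won" the merge.
  @SuppressWarnings("rawtypes")
  static HoodieRecord precombine(HoodieRecordMerger recordMerger, Properties props, HoodieRecord rec1, HoodieRecord rec2) {
    HoodieRecord reduced;
    try {
      reduced = recordMerger.merge(rec1, rec2, null, props).get();
    } catch (IOException e) {
      throw new HoodieException(String.format("Failed to merge records %s and %s", rec1, rec2), e);
    }
    HoodieKey reducedKey = rec1.getData().equals(reduced.getData()) ? rec1.getKey() : rec2.getKey();
    return reduced.newInstance(reducedKey);
  }
}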
-import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; -import org.apache.hudi.keygen.BaseKeyGenerator; -import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; @@ -64,6 +57,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.stream.Collectors; import static org.apache.hudi.common.table.log.HoodieFileSliceReader.getFileSliceReader; @@ -192,15 +186,15 @@ private List> readRecordsForGroupWithLogs(List baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) + Option baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) ? Option.empty() - : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); + : Option.of(HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); Iterator> fileSliceReader = getFileSliceReader(baseFileReader, scanner, readerSchema, - tableConfig.getPayloadClass(), - tableConfig.getPreCombineField(), + tableConfig.getProps(), tableConfig.populateMetaFields() ? Option.empty() : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp()))); fileSliceReader.forEachRemaining(records::add); @@ -220,9 +214,9 @@ private List> readRecordsForGroupBaseFiles(List { try { Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(getWriteConfig().getSchema())); - HoodieAvroFileReader baseFileReader = HoodieFileReaderFactory.getFileReader(getHoodieTable().getHadoopConf(), new Path(clusteringOp.getDataFilePath())); - Iterator recordIterator = baseFileReader.getRecordIterator(readerSchema); - recordIterator.forEachRemaining(record -> records.add(transform(record))); + HoodieFileReader baseFileReader = HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(getHoodieTable().getHadoopConf(), new Path(clusteringOp.getDataFilePath())); + Iterator recordIterator = baseFileReader.getRecordIterator(readerSchema); + recordIterator.forEachRemaining(record -> records.add(record.wrapIntoHoodieRecordPayloadWithKeyGen(new Properties(), Option.empty()))); } catch (IOException e) { throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e); @@ -230,19 +224,4 @@ private List> readRecordsForGroupBaseFiles(List transform(IndexedRecord indexedRecord) { - GenericRecord record = (GenericRecord) indexedRecord; - Option keyGeneratorOpt = Option.empty(); - String key = KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt); - String partition = KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt); - HoodieKey hoodieKey = new HoodieKey(key, partition); - - HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); - HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); - return hoodieRecord; - } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaSortAndSizeExecutionStrategy.java 
b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaSortAndSizeExecutionStrategy.java index 8258c5e3e01d4..9e0a675fceb77 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaSortAndSizeExecutionStrategy.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaSortAndSizeExecutionStrategy.java @@ -24,7 +24,7 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.io.CreateHandleFactory; import org.apache.hudi.table.HoodieTable; diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java index 4dcd0a6f7b6d7..9593b5e72f6f3 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java @@ -20,7 +20,6 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; @@ -33,16 +32,13 @@ import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.BinaryDecoder; -import org.apache.avro.io.BinaryEncoder; import org.apache.hadoop.conf.Configuration; import java.io.IOException; import java.util.Iterator; import java.util.List; +import java.util.Properties; public class JavaMergeHelper extends BaseMergeHelper>, List, List> { @@ -66,38 +62,43 @@ public void runMerge(HoodieTable>, List, List HoodieMergeHandle>, List, List> mergeHandle = upsertHandle; HoodieBaseFile baseFile = mergeHandle.baseFileForMerge(); - final GenericDatumWriter gWriter; - final GenericDatumReader gReader; + // Support schema evolution Schema readSchema; + // These two schema used to replace gWriter and gReader. + // In previous logic, avro record is serialized by gWriter and then is deserialized by gReader. + // Now we converge this logic in record#rewrite. 
+ Schema readerSchema; + Schema writerSchema; + HoodieFileReader reader = HoodieFileReaderFactory.getReaderFactory(table.getConfig().getRecordMerger().getRecordType()).getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath()); if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) { - readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema(); - gWriter = new GenericDatumWriter<>(readSchema); - gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields()); + readSchema = reader.getSchema(); + writerSchema = readSchema; + readerSchema = mergeHandle.getWriterSchemaWithMetaFields(); } else { - gReader = null; - gWriter = null; + readerSchema = null; + writerSchema = null; readSchema = mergeHandle.getWriterSchemaWithMetaFields(); } BoundedInMemoryExecutor wrapper = null; - HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath()); try { final Iterator readerIterator; if (baseFile.getBootstrapBaseFile().isPresent()) { readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation); } else { - readerIterator = reader.getRecordIterator(readSchema, HoodieAvroIndexedRecord::new); + readerIterator = reader.getRecordIterator(readSchema); } - ThreadLocal encoderCache = new ThreadLocal<>(); - ThreadLocal decoderCache = new ThreadLocal<>(); wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> { if (!externalSchemaTransformation) { return record; } - // TODO Other type of record need to change - return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) ((HoodieRecord)record).getData()); + try { + return ((HoodieRecord) record).rewriteRecord(writerSchema, new Properties(), readerSchema); + } catch (IOException e) { + throw new HoodieException(e); + } }); wrapper.execute(); } catch (Exception e) { diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java index 39c60447c5789..cc3d28fb20953 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java @@ -23,14 +23,17 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; +import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Properties; import java.util.stream.Collectors; public class JavaWriteHelper extends BaseWriteHelper>, @@ -54,7 +57,7 @@ protected List> tag(List> dedupedRecords, Hoodie @Override public List> deduplicateRecords( - List> records, HoodieIndex index, int parallelism, HoodieMerge merge) { + List> records, HoodieIndex index, int parallelism, HoodieRecordMerger recordMerger, Properties 
props) { boolean isIndexingGlobal = index.isGlobal(); Map>>> keyedRecords = records.stream().map(record -> { HoodieKey hoodieKey = record.getKey(); @@ -64,8 +67,13 @@ public List> deduplicateRecords( }).collect(Collectors.groupingBy(Pair::getLeft)); return keyedRecords.values().stream().map(x -> x.stream().map(Pair::getRight).reduce((rec1, rec2) -> { - @SuppressWarnings("unchecked") - HoodieRecord reducedRecord = merge.preCombine(rec1,rec2); + HoodieRecord reducedRecord; + try { + // Precombine do not need schema and do not return null + reducedRecord = recordMerger.merge(rec1, rec2, null, props).get(); + } catch (IOException e) { + throw new HoodieException(String.format("Error to merge two records, %s, %s", rec1, rec2), e); + } // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java index 28d3ac6e9bd6e..2c0409a502fb0 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java @@ -35,7 +35,7 @@ import org.apache.hudi.common.util.BaseFileUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java index 9882709d37813..25669efc654bd 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java @@ -29,15 +29,14 @@ import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.ClusteringOperation; -import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.RewriteAvroPayload; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.FutureUtils; +import org.apache.hudi.common.util.MappingIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; @@ -45,15 +44,13 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieClusteringException; -import 
org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory; import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; import org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner; import org.apache.hudi.io.IOUtils; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.keygen.BaseKeyGenerator; -import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieTable; @@ -61,8 +58,6 @@ import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -78,6 +73,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.common.table.HoodieTableConfig.POPULATE_META_FIELDS; import static org.apache.hudi.common.table.log.HoodieFileSliceReader.getFileSliceReader; import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS; @@ -154,7 +150,7 @@ protected BulkInsertPartitioner>> getPartitioner(Map> readRecordsForGroupWithLogs(JavaSparkContext .withPartition(clusteringOp.getPartitionPath()) .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType()) .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .withRecordMerger(config.getRecordMerger()) .build(); - Option baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) + Option baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) ? Option.empty() - : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); + : Option.of(HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); recordIterators.add(getFileSliceReader(baseFileReader, scanner, readerSchema, - tableConfig.getPayloadClass(), - tableConfig.getPreCombineField(), + tableConfig.getProps(), tableConfig.populateMetaFields() ? 
Option.empty() : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp())))); } catch (IOException e) { @@ -258,12 +254,20 @@ private HoodieData> readRecordsForGroupBaseFiles(JavaSparkContex // closure, as this might lead to issues attempting to serialize its nested fields return HoodieJavaRDD.of(jsc.parallelize(clusteringOps, clusteringOps.size()) .mapPartitions(clusteringOpsPartition -> { - List> iteratorsForPartition = new ArrayList<>(); + List>> iteratorsForPartition = new ArrayList<>(); clusteringOpsPartition.forEachRemaining(clusteringOp -> { try { Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(writeConfig.getSchema())); - HoodieAvroFileReader baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(clusteringOp.getDataFilePath())); - iteratorsForPartition.add(baseFileReader.getRecordIterator(readerSchema)); + HoodieFileReader baseFileReader = HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(hadoopConf.get(), new Path(clusteringOp.getDataFilePath())); + Option keyGeneratorOp; + if (!Boolean.parseBoolean(writeConfig.getProps().getOrDefault(POPULATE_META_FIELDS.key(), POPULATE_META_FIELDS.defaultValue().toString()).toString())) { + keyGeneratorOp = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(writeConfig.getProps())); + } else { + keyGeneratorOp = Option.empty(); + } + MappingIterator mappingIterator = new MappingIterator((ClosableIterator) baseFileReader.getRecordIterator(readerSchema), + rec -> ((HoodieRecord) rec).wrapIntoHoodieRecordPayloadWithKeyGen(writeConfig.getProps(), keyGeneratorOp)); + iteratorsForPartition.add(mappingIterator); } catch (IOException e) { throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e); @@ -271,8 +275,7 @@ private HoodieData> readRecordsForGroupBaseFiles(JavaSparkContex }); return new ConcatenatingIterator<>(iteratorsForPartition); - }) - .map(record -> transform(record, writeConfig))); + })); } /** @@ -287,26 +290,4 @@ private JavaRDD[] convertStreamToArray(Stream> } return writeStatusRDDArray; } - - /** - * Transform IndexedRecord into HoodieRecord. 
- */ - private static HoodieRecord transform(IndexedRecord indexedRecord, HoodieWriteConfig writeConfig) { - GenericRecord record = (GenericRecord) indexedRecord; - Option keyGeneratorOpt = Option.empty(); - if (!writeConfig.populateMetaFields()) { - try { - keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(writeConfig.getProps())); - } catch (IOException e) { - throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e); - } - } - String key = KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt); - String partition = KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt); - HoodieKey hoodieKey = new HoodieKey(key, partition); - - HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); - HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); - return hoodieRecord; - } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java index c2689921925b2..918542dbeffba 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java @@ -24,34 +24,29 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.client.utils.ConcatenatingIterator; import org.apache.hudi.common.config.SerializableSchema; -import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.ClusteringGroupInfo; import org.apache.hudi.common.model.ClusteringOperation; -import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.RewriteAvroPayload; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.MappingIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieClusteringException; -import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.keygen.BaseKeyGenerator; -import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; @@ -68,6 +63,8 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; +import static 
org.apache.hudi.common.table.HoodieTableConfig.POPULATE_META_FIELDS; + /** * Clustering strategy to submit single spark jobs. * MultipleSparkJobExecution strategy is not ideal for use cases that require large number of clustering groups @@ -151,40 +148,28 @@ private Iterator> readRecordsForGroupBaseFiles(List>> iteratorsForPartition = clusteringOps.stream().map(clusteringOp -> { Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(getWriteConfig().getSchema())); - Iterable indexedRecords = () -> { + Iterable> indexedRecords = () -> { try { - return HoodieFileReaderFactory.getFileReader(getHoodieTable().getHadoopConf(), new Path(clusteringOp.getDataFilePath())).getRecordIterator(readerSchema); + + HoodieFileReader baseFileReader = HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(getHoodieTable().getHadoopConf(), new Path(clusteringOp.getDataFilePath())); + Option keyGeneratorOp; + if (!Boolean.parseBoolean(getWriteConfig().getProps().getOrDefault(POPULATE_META_FIELDS.key(), POPULATE_META_FIELDS.defaultValue().toString()).toString())) { + keyGeneratorOp = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(getWriteConfig().getProps())); + } else { + keyGeneratorOp = Option.empty(); + } + MappingIterator mappingIterator = new MappingIterator((ClosableIterator) baseFileReader.getRecordIterator(readerSchema), + rec -> ((HoodieRecord) rec).wrapIntoHoodieRecordPayloadWithKeyGen(getWriteConfig().getProps(), keyGeneratorOp)); + return mappingIterator; } catch (IOException e) { throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e); } }; - return StreamSupport.stream(indexedRecords.spliterator(), false).map(record -> transform(record)).iterator(); + return StreamSupport.stream(indexedRecords.spliterator(), false).iterator(); }).collect(Collectors.toList()); return new ConcatenatingIterator<>(iteratorsForPartition); } - - /** - * Transform IndexedRecord into HoodieRecord. 
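Both Spark clustering strategies now read base files through the record-type-aware reader factory and wrap each record via wrapIntoHoodieRecordPayloadWithKeyGen, which replaces the deleted transform helpers. A condensed sketch of that read-and-wrap path; the wrapper class and method are hypothetical, while the individual calls mirror the strategies above.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;

import static org.apache.hudi.common.table.HoodieTableConfig.POPULATE_META_FIELDS;

// Hypothetical helper: open the base file with the reader matching the configured record type, then wrap
// every record into a HoodieRecord, resolving keys through a key generator only when meta fields are absent.
final class BaseFileReadSketch {
  private BaseFileReadSketch() {
  }

  @SuppressWarnings({"unchecked", "rawtypes"})
  static List<HoodieRecord> readAndWrapBaseFile(Configuration conf, Path baseFilePath, Schema readerSchema,
                                                HoodieRecordType recordType, TypedProperties props) throws IOException {
    HoodieFileReader baseFileReader = HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(conf, baseFilePath);
    boolean populateMetaFields = Boolean.parseBoolean(
        props.getOrDefault(POPULATE_META_FIELDS.key(), POPULATE_META_FIELDS.defaultValue().toString()).toString());
    Option<BaseKeyGenerator> keyGeneratorOpt = populateMetaFields
        ? Option.empty()
        : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(props));
    List<HoodieRecord> records = new ArrayList<>();
    baseFileReader.getRecordIterator(readerSchema).forEachRemaining(rec ->
        records.add(((HoodieRecord) rec).wrapIntoHoodieRecordPayloadWithKeyGen(props, keyGeneratorOpt)));
    return records;
  }
}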
- */ - private HoodieRecord transform(IndexedRecord indexedRecord) { - GenericRecord record = (GenericRecord) indexedRecord; - Option keyGeneratorOpt = Option.empty(); - if (!getWriteConfig().populateMetaFields()) { - try { - keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getWriteConfig().getProps()))); - } catch (IOException e) { - throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e); - } - } - String key = KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt); - String partition = KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt); - HoodieKey hoodieKey = new HoodieKey(key, partition); - - HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); - HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); - return hoodieRecord; - } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java index 0edef9121075a..8a65e49db4a6c 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java @@ -24,7 +24,7 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieClusteringException; import org.apache.hudi.io.SingleFileHandleCreateFactory; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSortAndSizeExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSortAndSizeExecutionStrategy.java index 33b7fad439573..0f71f78a66ca8 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSortAndSizeExecutionStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSortAndSizeExecutionStrategy.java @@ -23,7 +23,7 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.io.CreateHandleFactory; import org.apache.hudi.table.HoodieTable; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/model/HoodieInternalRow.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/model/HoodieInternalRow.java index 7425137665746..2577f2601cb6a 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/model/HoodieInternalRow.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/model/HoodieInternalRow.java @@ -84,7 +84,7 @@ public HoodieInternalRow(UTF8String commitTime, this.sourceContainsMetaFields = 
sourceContainsMetaFields; } - private HoodieInternalRow(UTF8String[] metaFields, + public HoodieInternalRow(UTF8String[] metaFields, InternalRow sourceRow, boolean sourceContainsMetaFields) { this.metaFields = metaFields; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/commmon/model/HoodieSparkRecord.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/commmon/model/HoodieSparkRecord.java new file mode 100644 index 0000000000000..f866914b275f1 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/commmon/model/HoodieSparkRecord.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.commmon.model; + +import org.apache.hudi.HoodieInternalRowUtils; +import org.apache.hudi.client.model.HoodieInternalRow; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.ConfigUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.SparkKeyGeneratorInterface; +import org.apache.hudi.util.HoodieSparkRecordUtils; + +import org.apache.avro.Schema; +import org.apache.spark.sql.HoodieCatalystExpressionUtils$; +import org.apache.spark.sql.HoodieUnsafeRowUtils; +import org.apache.spark.sql.HoodieUnsafeRowUtils.NestedFieldPath; +import org.apache.spark.sql.catalyst.CatalystTypeConverters; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; + +import java.io.IOException; +import java.util.Map; +import java.util.Properties; + +import static org.apache.hudi.common.table.HoodieTableConfig.POPULATE_META_FIELDS; +import static org.apache.spark.sql.types.DataTypes.BooleanType; +import static org.apache.spark.sql.types.DataTypes.StringType; + +/** + * Spark Engine-specific Implementations of `HoodieRecord`. 
+ */ +public class HoodieSparkRecord extends HoodieRecord { + + private StructType structType = null; + private Option schemaFingerPrint = Option.empty(); + + public HoodieSparkRecord(InternalRow data, StructType schema) { + super(null, data); + initSchema(schema); + } + + public HoodieSparkRecord(HoodieKey key, InternalRow data, StructType schema) { + super(key, data); + initSchema(schema); + } + + public HoodieSparkRecord(HoodieKey key, InternalRow data, StructType schema, HoodieOperation operation) { + super(key, data, operation); + initSchema(schema); + } + + public HoodieSparkRecord(HoodieSparkRecord record) { + super(record); + initSchema(record.getStructType()); + } + + @Override + public HoodieRecord newInstance() { + return new HoodieSparkRecord(this); + } + + @Override + public HoodieRecord newInstance(HoodieKey key, HoodieOperation op) { + return new HoodieSparkRecord(key, data, getStructType(), op); + } + + @Override + public HoodieRecord newInstance(HoodieKey key) { + return new HoodieSparkRecord(key, data, getStructType()); + } + + @Override + public String getRecordKey(Option keyGeneratorOpt) { + if (key != null) { + return getRecordKey(); + } + return keyGeneratorOpt.isPresent() ? ((SparkKeyGeneratorInterface) keyGeneratorOpt.get()) + .getRecordKey(data, getStructType()).toString() : data.getString(HoodieMetadataField.RECORD_KEY_METADATA_FIELD.ordinal()); + } + + @Override + public String getRecordKey(String keyFieldName) { + if (key != null) { + return getRecordKey(); + } + DataType dataType = getStructType().apply(keyFieldName).dataType(); + int pos = getStructType().fieldIndex(keyFieldName); + return data.get(pos, dataType).toString(); + } + + @Override + public HoodieRecordType getRecordType() { + return HoodieRecordType.SPARK; + } + + @Override + public Object getRecordColumnValues(Schema recordSchema, String[] columns, boolean consistentLogicalTimestampEnabled) { + return HoodieSparkRecordUtils.getRecordColumnValues(data, columns, getStructType(), consistentLogicalTimestampEnabled); + } + + @Override + public HoodieRecord mergeWith(HoodieRecord other, Schema targetSchema) throws IOException { + StructType otherStructType = ((HoodieSparkRecord) other).getStructType(); + StructType writerStructType = HoodieInternalRowUtils.getCachedSchema(targetSchema); + InternalRow mergeRow = HoodieInternalRowUtils.stitchRecords(data, getStructType(), (InternalRow) other.getData(), otherStructType, writerStructType); + return new HoodieSparkRecord(getKey(), mergeRow, writerStructType, getOperation()); + } + + @Override + public HoodieRecord rewriteRecord(Schema recordSchema, Properties props, Schema targetSchema) throws IOException { + StructType targetStructType = HoodieInternalRowUtils.getCachedSchema(targetSchema); + UTF8String[] metaFields = extractMetaField(targetStructType); + if (metaFields.length == 0) { + throw new UnsupportedOperationException(); + } + + InternalRow resultRow; + if (extractMetaField(getStructType()).length == 0) { + resultRow = new HoodieInternalRow(metaFields, data, false); + } else { + resultRow = new HoodieInternalRow(metaFields, data, true); + } + + return new HoodieSparkRecord(getKey(), resultRow, targetStructType, getOperation()); + } + + @Override + public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties props, Schema newSchema, Map renameCols) throws IOException { + StructType newStructType = HoodieInternalRowUtils.getCachedSchema(newSchema); + InternalRow rewriteRow = HoodieInternalRowUtils.rewriteRecordWithNewSchema(data, 
getStructType(), newStructType, renameCols); + UnsafeProjection unsafeConvert = HoodieInternalRowUtils.getCachedUnsafeConvert(newStructType); + InternalRow resultRow = unsafeConvert.apply(rewriteRow); + UTF8String[] metaFields = extractMetaField(newStructType); + if (metaFields.length > 0) { + resultRow = new HoodieInternalRow(metaFields, data, true); + } + + return new HoodieSparkRecord(getKey(), resultRow, newStructType, getOperation()); + } + + @Override + public HoodieRecord updateValues(Schema recordSchema, Properties props, Map metadataValues) throws IOException { + metadataValues.forEach((key, value) -> { + int pos = getStructType().fieldIndex(key); + if (value != null) { + data.update(pos, CatalystTypeConverters.convertToCatalyst(value)); + } + }); + + return new HoodieSparkRecord(getKey(), data, getStructType(), getOperation()); + } + + @Override + public boolean isDelete(Schema schema, Properties props) throws IOException { + if (null == data) { + return true; + } + if (schema.getField(HoodieRecord.HOODIE_IS_DELETED_FIELD) == null) { + return false; + } + Object deleteMarker = data.get(schema.getField(HoodieRecord.HOODIE_IS_DELETED_FIELD).pos(), BooleanType); + return deleteMarker instanceof Boolean && (boolean) deleteMarker; + } + + @Override + public boolean shouldIgnore(Schema schema, Properties props) throws IOException { + if (data != null && data.equals(SENTINEL)) { + return true; + } else { + return false; + } + } + + @Override + public HoodieRecord wrapIntoHoodieRecordPayloadWithParams( + Schema schema, Properties props, + Option> simpleKeyGenFieldsOpt, + Boolean withOperation, + Option partitionNameOp, + Boolean populateMetaFields) { + if (populateMetaFields) { + return HoodieSparkRecordUtils.convertToHoodieSparkRecord(getStructType(), data, withOperation); + } else if (simpleKeyGenFieldsOpt.isPresent()) { + return HoodieSparkRecordUtils.convertToHoodieSparkRecord(getStructType(), data, simpleKeyGenFieldsOpt.get(), withOperation, Option.empty()); + } else { + return HoodieSparkRecordUtils.convertToHoodieSparkRecord(getStructType(), data, withOperation, partitionNameOp); + } + } + + @Override + public HoodieRecord wrapIntoHoodieRecordPayloadWithKeyGen(Properties props, Option keyGen) { + String key; + String partition; + if (keyGen.isPresent() && !Boolean.parseBoolean(props.getOrDefault(POPULATE_META_FIELDS.key(), POPULATE_META_FIELDS.defaultValue().toString()).toString())) { + SparkKeyGeneratorInterface keyGenerator = (SparkKeyGeneratorInterface) keyGen.get(); + key = keyGenerator.getRecordKey(data, getStructType()).toString(); + partition = keyGenerator.getPartitionPath(data, getStructType()).toString(); + } else { + key = data.get(HoodieMetadataField.RECORD_KEY_METADATA_FIELD.ordinal(), StringType).toString(); + partition = data.get(HoodieMetadataField.PARTITION_PATH_METADATA_FIELD.ordinal(), StringType).toString(); + } + HoodieKey hoodieKey = new HoodieKey(key, partition); + return new HoodieSparkRecord(hoodieKey, data, getStructType(), getOperation()); + } + + @Override + public Option> getMetadata() { + return Option.empty(); + } + + @Override + public Option toIndexedRecord(Schema schema, Properties prop) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Comparable getOrderingValue(Properties props) { + String orderingField = ConfigUtils.getOrderingField(props); + if (!HoodieCatalystExpressionUtils$.MODULE$.existField(getStructType(), orderingField)) { + return 0; + } else { + NestedFieldPath nestedFieldPath = 
HoodieInternalRowUtils.getCachedPosList(getStructType(), + orderingField); + Comparable value = (Comparable) HoodieUnsafeRowUtils.getNestedInternalRowValue( + data, nestedFieldPath); + return value; + } + } + + public StructType getStructType() { + if (schemaFingerPrint.isPresent()) { + return HoodieInternalRowUtils.getCachedSchemaFromFingerPrint(schemaFingerPrint.get()); + } else { + return structType; + } + } + + private void initSchema(StructType structType) { + if (HoodieInternalRowUtils.containsCompressedSchema(structType)) { + HoodieInternalRowUtils.addCompressedSchema(structType); + this.schemaFingerPrint = Option.of(HoodieInternalRowUtils.getCachedFingerPrintFromSchema(structType)); + } else { + this.structType = structType; + } + } + + public void setStructType(StructType structType) { + if (structType != null) { + initSchema(structType); + } + } + + private UTF8String[] extractMetaField(StructType structType) { + return HOODIE_META_COLUMNS_WITH_OPERATION.stream() + .filter(f -> HoodieCatalystExpressionUtils$.MODULE$.existField(structType, f)) + .map(UTF8String::fromString) + .toArray(UTF8String[]::new); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDConsistentBucketPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDConsistentBucketPartitioner.java index 70258d7938357..94e48c959c662 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDConsistentBucketPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDConsistentBucketPartitioner.java @@ -22,7 +22,6 @@ import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.ConsistentHashingNode; -import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieConsistentHashingMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -235,8 +234,8 @@ private JavaRDD> doPartitionAndCustomColumnSort(JavaRDD> comparator = (Comparator> & Serializable) (t1, t2) -> { - Object obj1 = HoodieAvroUtils.getRecordColumnValues((HoodieAvroRecord) t1, sortColumns, schema, consistentLogicalTimestampEnabled); - Object obj2 = HoodieAvroUtils.getRecordColumnValues((HoodieAvroRecord)t2, sortColumns, schema, consistentLogicalTimestampEnabled); + Object obj1 = t1.getRecordColumnValues(schema.get(), sortColumns, consistentLogicalTimestampEnabled); + Object obj2 = t2.getRecordColumnValues(schema.get(), sortColumns, consistentLogicalTimestampEnabled); return ((Comparable) obj1).compareTo(obj2); }; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDCustomColumnsSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDCustomColumnsSortPartitioner.java index 120677f811050..dbbcb22e90aef 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDCustomColumnsSortPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDCustomColumnsSortPartitioner.java @@ -18,9 +18,7 @@ package org.apache.hudi.execution.bulkinsert; -import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.SerializableSchema; -import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import 
org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; @@ -63,7 +61,7 @@ public JavaRDD> repartitionRecords(JavaRDD> reco final boolean consistentLogicalTimestampEnabled = this.consistentLogicalTimestampEnabled; return records.sortBy( record -> { - Object recordValue = HoodieAvroUtils.getRecordColumnValues((HoodieAvroRecord)record, sortColumns, schema, consistentLogicalTimestampEnabled); + Object recordValue = record.getRecordColumnValues(schema.get(), sortColumns, consistentLogicalTimestampEnabled); // null values are replaced with empty string for null_first order if (recordValue == null) { return StringUtils.EMPTY_STRING; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java index a5921e5a460cf..1f74e82616f11 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java @@ -19,23 +19,33 @@ package org.apache.hudi.execution.bulkinsert; import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.HoodieInternalRowUtils; import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.SparkConversionUtils; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.commmon.model.HoodieSparkRecord; import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.model.RewriteAvroPayload; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieClusteringConfig.LayoutOptimizationStrategy; +import org.apache.hudi.config.HoodieClusteringConfig.SpatialCurveCompositionStrategyType; import org.apache.hudi.sort.SpaceCurveSortingHelper; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.broadcast.Broadcast; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; import java.util.Arrays; import java.util.List; @@ -54,42 +64,65 @@ public class RDDSpatialCurveSortPartitioner private final SerializableSchema schema; private final HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy; private final HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType; + private final HoodieRecordType recordType; public RDDSpatialCurveSortPartitioner(HoodieSparkEngineContext sparkEngineContext, - String[] orderByColumns, - HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, - HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType, - Schema schema) { + String[] orderByColumns, + LayoutOptimizationStrategy layoutOptStrategy, + SpatialCurveCompositionStrategyType curveCompositionStrategyType, + Schema schema, HoodieRecordType recordType) { 
this.sparkEngineContext = sparkEngineContext; this.orderByColumns = orderByColumns; this.layoutOptStrategy = layoutOptStrategy; this.curveCompositionStrategyType = curveCompositionStrategyType; this.schema = new SerializableSchema(schema); + this.recordType = recordType; } @Override public JavaRDD> repartitionRecords(JavaRDD> records, int outputSparkPartitions) { - JavaRDD genericRecordsRDD = - records.map(f -> (GenericRecord) f.toIndexedRecord(schema.get(), new Properties()).get()); - - Dataset sourceDataset = - AvroConversionUtils.createDataFrame( - genericRecordsRDD.rdd(), - schema.toString(), - sparkEngineContext.getSqlContext().sparkSession() - ); - - Dataset sortedDataset = reorder(sourceDataset, outputSparkPartitions); - - return HoodieSparkUtils.createRdd(sortedDataset, schema.get().getName(), schema.get().getNamespace(), false, Option.empty()) - .toJavaRDD() - .map(record -> { - String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieKey hoodieKey = new HoodieKey(key, partition); - HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, new RewriteAvroPayload(record)); - return hoodieRecord; - }); + if (recordType == HoodieRecordType.AVRO) { + JavaRDD genericRecordsRDD = + records.map(f -> (GenericRecord) f.toIndexedRecord(schema.get(), new Properties()).get().getData()); + + Dataset sourceDataset = + AvroConversionUtils.createDataFrame( + genericRecordsRDD.rdd(), + schema.toString(), + sparkEngineContext.getSqlContext().sparkSession() + ); + + Dataset sortedDataset = reorder(sourceDataset, outputSparkPartitions); + + return HoodieSparkUtils.createRdd(sortedDataset, schema.get().getName(), schema.get().getNamespace(), false, Option.empty()) + .toJavaRDD() + .map(record -> { + String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + HoodieKey hoodieKey = new HoodieKey(key, partition); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, new RewriteAvroPayload(record)); + return hoodieRecord; + }); + } else if (recordType == HoodieRecordType.SPARK) { + StructType structType = HoodieInternalRowUtils.getCachedSchema(schema.get()); + Broadcast structTypeBC = sparkEngineContext.getJavaSparkContext().broadcast(structType); + Dataset sourceDataset = SparkConversionUtils.createDataFrame(records.rdd(), sparkEngineContext.getSqlContext().sparkSession(), structType); + + Dataset sortedDataset = reorder(sourceDataset, outputSparkPartitions); + + return sortedDataset.queryExecution().toRdd() + .toJavaRDD() + .map(row -> { + InternalRow internalRow = row.copy(); + String key = internalRow.getString(HoodieMetadataField.RECORD_KEY_METADATA_FIELD.ordinal()); + String partition = internalRow.getString(HoodieMetadataField.PARTITION_PATH_METADATA_FIELD.ordinal()); + HoodieKey hoodieKey = new HoodieKey(key, partition); + HoodieRecord hoodieRecord = new HoodieSparkRecord(hoodieKey, internalRow, structTypeBC.value()); + return hoodieRecord; + }); + } else { + throw new UnsupportedOperationException(recordType.name()); + } } private Dataset reorder(Dataset dataset, int numOutputGroups) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReader.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReader.java new file mode 100644 index 0000000000000..86134eda37885 --- /dev/null +++ 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReader.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hudi.HoodieInternalRowUtils; +import org.apache.hudi.commmon.model.HoodieSparkRecord; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.MappingIterator; + +import org.apache.avro.Schema; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; + +import java.io.IOException; + +import static org.apache.hudi.common.util.TypeUtils.unsafeCast; + +public interface HoodieSparkFileReader extends HoodieFileReader { + + ClosableIterator getInternalRowIterator(Schema readerSchema) throws IOException; + + ClosableIterator getInternalRowIterator(Schema readerSchema, Schema requestedSchema) throws IOException; + + default ClosableIterator> getRecordIterator(Schema readerSchema) throws IOException { + ClosableIterator iterator = getInternalRowIterator(readerSchema); + StructType structType = HoodieInternalRowUtils.getCachedSchema(readerSchema); + return new MappingIterator<>(iterator, data -> unsafeCast(new HoodieSparkRecord(data, structType))); + } + + @Override + default ClosableIterator> getRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException { + ClosableIterator iterator = getInternalRowIterator(readerSchema, requestedSchema); + StructType structType = HoodieInternalRowUtils.getCachedSchema(requestedSchema); + return new MappingIterator<>(iterator, data -> unsafeCast(new HoodieSparkRecord(data, structType))); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java new file mode 100644 index 0000000000000..f0d650ecdc74d --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.exception.HoodieIOException; + +import java.io.IOException; +import org.apache.spark.sql.internal.SQLConf; + +public class HoodieSparkFileReaderFactory extends HoodieFileReaderFactory { + + private static class SingletonHolder { + private static final HoodieSparkFileReaderFactory INSTANCE = new HoodieSparkFileReaderFactory(); + } + + private HoodieSparkFileReaderFactory() { + } + + public static HoodieFileReaderFactory getFileReaderFactory() { + return SingletonHolder.INSTANCE; + } + + protected HoodieFileReader newParquetFileReader(Configuration conf, Path path) { + conf.setIfUnset(SQLConf.PARQUET_BINARY_AS_STRING().key(), + SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()); + conf.setIfUnset(SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(), + SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString()); + conf.setIfUnset(SQLConf.CASE_SENSITIVE().key(), SQLConf.CASE_SENSITIVE().defaultValueString()); + return new HoodieSparkParquetReader(conf, path); + } + + protected HoodieFileReader newHFileFileReader(Configuration conf, Path path) throws IOException { + throw new HoodieIOException("HFile reading is not supported yet"); + } + + protected HoodieFileReader newOrcFileReader(Configuration conf, Path path) { + throw new HoodieIOException("ORC reading is not supported yet"); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriter.java new file mode 100644 index 0000000000000..58ee01182c373 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriter.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.avro.Schema; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.spark.sql.catalyst.CatalystTypeConverters; +import org.apache.spark.sql.catalyst.InternalRow; + +import java.io.IOException; +import java.util.Properties; + +import static org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField.COMMIT_SEQNO_METADATA_FIELD; +import static org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField.COMMIT_TIME_METADATA_FIELD; +import static org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField.FILENAME_METADATA_FIELD; +import static org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField.PARTITION_PATH_METADATA_FIELD; +import static org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField.RECORD_KEY_METADATA_FIELD; + +public interface HoodieSparkFileWriter extends HoodieFileWriter { + boolean canWrite(); + + void close() throws IOException; + + void writeRowWithMetadata(HoodieKey recordKey, InternalRow row) throws IOException; + + void writeRow(String recordKey, InternalRow row) throws IOException; + + @Override + default void write(String recordKey, HoodieRecord record, Schema schema, Properties props) throws IOException { + writeRow(recordKey, (InternalRow) record.getData()); + } + + @Override + default void writeWithMetadata(HoodieKey key, HoodieRecord record, Schema schema, Properties props) throws IOException { + writeRowWithMetadata(key, (InternalRow) record.getData()); + } + + default InternalRow prepRecordWithMetadata(HoodieKey key, InternalRow row, String instantTime, Integer partitionId, long recordIndex, String fileName) { + String seqId = HoodieRecord.generateSequenceId(instantTime, partitionId, recordIndex); + row.update(COMMIT_TIME_METADATA_FIELD.ordinal(), CatalystTypeConverters.convertToCatalyst(instantTime)); + row.update(COMMIT_SEQNO_METADATA_FIELD.ordinal(), CatalystTypeConverters.convertToCatalyst(seqId)); + row.update(RECORD_KEY_METADATA_FIELD.ordinal(), CatalystTypeConverters.convertToCatalyst(key.getRecordKey())); + row.update(PARTITION_PATH_METADATA_FIELD.ordinal(), CatalystTypeConverters.convertToCatalyst(key.getPartitionPath())); + row.update(FILENAME_METADATA_FIELD.ordinal(), CatalystTypeConverters.convertToCatalyst(fileName)); + return row; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java new file mode 100644 index 0000000000000..f746bb0e8e7bf --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hudi.HoodieInternalRowUtils; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.row.HoodieRowParquetConfig; +import org.apache.hudi.io.storage.row.HoodieRowParquetWriteSupport; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +import java.io.IOException; + +public class HoodieSparkFileWriterFactory extends HoodieFileWriterFactory { + + private static class SingletonHolder { + + private static final HoodieSparkFileWriterFactory INSTANCE = new HoodieSparkFileWriterFactory(); + } + + public static HoodieFileWriterFactory getFileWriterFactory() { + return HoodieSparkFileWriterFactory.SingletonHolder.INSTANCE; + } + + @Override + protected HoodieFileWriter newParquetFileWriter( + String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + boolean populateMetaFields = config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS); + boolean enableBloomFilter = populateMetaFields; + Option filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty(); + String compressionCodecName = config.getStringOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME); + // An empty PARQUET_COMPRESSION_CODEC_NAME is treated as uncompressed + if (compressionCodecName.isEmpty()) { + compressionCodecName = null; + } + HoodieRowParquetWriteSupport writeSupport = new HoodieRowParquetWriteSupport(conf, + HoodieInternalRowUtils.getCachedSchema(schema), filter, + HoodieStorageConfig.newBuilder().fromProperties(config.getProps()).build()); + HoodieRowParquetConfig parquetConfig = new HoodieRowParquetConfig(writeSupport, + CompressionCodecName.fromConf(compressionCodecName), + config.getIntOrDefault(HoodieStorageConfig.PARQUET_BLOCK_SIZE), + config.getIntOrDefault(HoodieStorageConfig.PARQUET_PAGE_SIZE), + config.getLongOrDefault(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE), + conf, + config.getDoubleOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION), + config.getBooleanOrDefault(HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED)); + parquetConfig.getHadoopConf().addResource(writeSupport.getHadoopConf()); + + return new HoodieSparkParquetWriter(path, parquetConfig, instantTime, taskContextSupplier, populateMetaFields); + } + + protected HoodieFileWriter newParquetFileWriter( + FSDataOutputStream outputStream, Configuration conf, HoodieConfig config, Schema schema) throws IOException { + boolean enableBloomFilter = false; + Option filter = enableBloomFilter ? 
Option.of(createBloomFilter(config)) : Option.empty(); + HoodieRowParquetWriteSupport writeSupport = new HoodieRowParquetWriteSupport(conf, + HoodieInternalRowUtils.getCachedSchema(schema), filter, + HoodieStorageConfig.newBuilder().fromProperties(config.getProps()).build()); + String compressionCodecName = config.getStringOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME); + // An empty PARQUET_COMPRESSION_CODEC_NAME is treated as uncompressed + if (compressionCodecName.isEmpty()) { + compressionCodecName = null; + } + HoodieRowParquetConfig parquetConfig = new HoodieRowParquetConfig(writeSupport, + CompressionCodecName.fromConf(compressionCodecName), + config.getInt(HoodieStorageConfig.PARQUET_BLOCK_SIZE), + config.getInt(HoodieStorageConfig.PARQUET_PAGE_SIZE), + config.getLong(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE), + writeSupport.getHadoopConf(), config.getDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION), + config.getBooleanOrDefault(HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED)); + parquetConfig.getHadoopConf().addResource(writeSupport.getHadoopConf()); + return new HoodieSparkParquetStreamWriter(outputStream, parquetConfig); + } + + @Override + protected HoodieFileWriter newHFileFileWriter(String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + throw new HoodieIOException("HFile writing is not supported yet"); + } + + @Override + protected HoodieFileWriter newOrcFileWriter(String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + throw new HoodieIOException("ORC writing is not supported yet"); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetReader.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetReader.java new file mode 100644 index 0000000000000..7078393d281d2 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetReader.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.HoodieInternalRowUtils; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.ParquetReaderIterator; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport; +import org.apache.spark.sql.internal.SQLConf; +import org.apache.spark.sql.types.StructType; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +public class HoodieSparkParquetReader implements HoodieSparkFileReader { + + private final Path path; + private final Configuration conf; + private final BaseFileUtils parquetUtils; + private List readerIterators = new ArrayList<>(); + + public HoodieSparkParquetReader(Configuration conf, Path path) { + this.path = path; + this.conf = conf; + this.parquetUtils = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET); + } + + @Override + public String[] readMinMaxRecordKeys() { + return parquetUtils.readMinMaxRecordKeys(conf, path); + } + + @Override + public BloomFilter readBloomFilter() { + return parquetUtils.readBloomFilterFromMetadata(conf, path); + } + + @Override + public Set filterRowKeys(Set candidateRowKeys) { + return parquetUtils.filterRowKeys(conf, path, candidateRowKeys); + } + + @Override + public ClosableIterator getInternalRowIterator(Schema schema) throws IOException { + return getInternalRowIterator(schema, null); + } + + @Override + public ClosableIterator getInternalRowIterator(Schema readerSchema, Schema requestedSchema) throws IOException { + if (requestedSchema == null) { + requestedSchema = readerSchema; + } + StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(readerSchema); + StructType requestedStructType = HoodieInternalRowUtils.getCachedSchema(requestedSchema); + conf.set(ParquetReadSupport.PARQUET_READ_SCHEMA, readerStructType.json()); + conf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA(), requestedStructType.json()); + conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING().key(), (Boolean) SQLConf.get().getConf(SQLConf.PARQUET_BINARY_AS_STRING())); + conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(), (Boolean) SQLConf.get().getConf(SQLConf.PARQUET_INT96_AS_TIMESTAMP())); + InputFile inputFile = HadoopInputFile.fromPath(path, conf); + ParquetReader reader = new ParquetReader.Builder(inputFile) { + @Override + protected ReadSupport getReadSupport() { + return new ParquetReadSupport(); + } + }.withConf(conf).build(); + ParquetReaderIterator parquetReaderIterator = new ParquetReaderIterator<>(reader, InternalRow::copy); + readerIterators.add(parquetReaderIterator); + return parquetReaderIterator; + } + + @Override + public Schema getSchema() { + return parquetUtils.readAvroSchema(conf, path); + } + + @Override + public void close() { + readerIterators.forEach(ParquetReaderIterator::close); + } + + @Override + public long getTotalRecords() { + return parquetUtils.getRowCount(conf, path); + } +} diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetStreamWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetStreamWriter.java new file mode 100644 index 0000000000000..5c6f85af290d0 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetStreamWriter.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.io.storage.row.HoodieRowParquetConfig; +import org.apache.hudi.io.storage.row.HoodieRowParquetWriteSupport; +import org.apache.hudi.parquet.io.OutputStreamBackedOutputFile; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.WriteSupport; +import org.apache.parquet.io.OutputFile; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.unsafe.types.UTF8String; + +public class HoodieSparkParquetStreamWriter implements HoodieSparkFileWriter, AutoCloseable { + private final ParquetWriter writer; + private final HoodieRowParquetWriteSupport writeSupport; + + public HoodieSparkParquetStreamWriter(FSDataOutputStream outputStream, + HoodieRowParquetConfig parquetConfig) throws IOException { + this.writeSupport = parquetConfig.getWriteSupport(); + this.writer = new Builder<>(new OutputStreamBackedOutputFile(outputStream), writeSupport) + .withWriteMode(ParquetFileWriter.Mode.CREATE) + .withCompressionCodec(parquetConfig.getCompressionCodecName()) + .withRowGroupSize(parquetConfig.getBlockSize()) + .withPageSize(parquetConfig.getPageSize()) + .withDictionaryPageSize(parquetConfig.getPageSize()) + .withDictionaryEncoding(parquetConfig.dictionaryEnabled()) + .withWriterVersion(ParquetWriter.DEFAULT_WRITER_VERSION) + .withConf(parquetConfig.getHadoopConf()) + .build(); + } + + @Override + public boolean canWrite() { + return true; + } + + @Override + public void writeRow(String key, InternalRow record) throws IOException { + writer.write(record); + writeSupport.add(UTF8String.fromString(key)); + } + + @Override + public void writeRowWithMetadata(HoodieKey key, InternalRow record) throws IOException { + // TODO support populating the metadata + this.writeRow(key.getRecordKey(), record); + } + + @Override + public void close() throws IOException { + writer.close(); + } + + private static class Builder extends ParquetWriter.Builder> { + private final WriteSupport writeSupport; + + private Builder(Path file, WriteSupport 
writeSupport) { + super(file); + this.writeSupport = writeSupport; + } + + private Builder(OutputFile file, WriteSupport writeSupport) { + super(file); + this.writeSupport = writeSupport; + } + + @Override + protected Builder self() { + return this; + } + + @Override + protected WriteSupport getWriteSupport(Configuration conf) { + return writeSupport; + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetWriter.java new file mode 100644 index 0000000000000..3b4a86502d280 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetWriter.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.io.storage.row.HoodieRowParquetConfig; +import org.apache.hudi.io.storage.row.HoodieRowParquetWriteSupport; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.unsafe.types.UTF8String; + +import java.io.IOException; + +public class HoodieSparkParquetWriter extends HoodieBaseParquetWriter implements HoodieSparkFileWriter { + + // TODO: better code reuse + private final String fileName; + private final String instantTime; + private final TaskContextSupplier taskContextSupplier; + private final boolean populateMetaFields; + private final HoodieRowParquetWriteSupport writeSupport; + + public HoodieSparkParquetWriter(Path file, + HoodieRowParquetConfig parquetConfig, + String instantTime, + TaskContextSupplier taskContextSupplier, + boolean populateMetaFields) throws IOException { + super(file, parquetConfig); + this.writeSupport = parquetConfig.getWriteSupport(); + this.fileName = file.getName(); + this.instantTime = instantTime; + this.taskContextSupplier = taskContextSupplier; + this.populateMetaFields = populateMetaFields; + } + + @Override + public void writeRowWithMetadata(HoodieKey key, InternalRow row) throws IOException { + if (populateMetaFields) { + prepRecordWithMetadata(key, row, instantTime, + taskContextSupplier.getPartitionIdSupplier().get(), getWrittenRecordCount(), fileName); + super.write(row); + writeSupport.add(UTF8String.fromString(key.getRecordKey())); + } else { + super.write(row); + } + } + + @Override + public void writeRow(String recordKey, InternalRow row) throws IOException { + super.write(row); + if (populateMetaFields) { + writeSupport.add(UTF8String.fromString(recordKey)); + } + } + + @Override + public void close() throws IOException { + super.close(); + } +} diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java index e68873f92efdb..8a61c7c44d900 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java @@ -67,7 +67,7 @@ private static HoodieInternalRowFileWriter newParquetInternalRowFileWriter(Path ) throws IOException { HoodieRowParquetWriteSupport writeSupport = - new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, bloomFilterOpt, writeConfig); + new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, bloomFilterOpt, writeConfig.getStorageConfig()); return new HoodieInternalRowParquetWriter( path, diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetConfig.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetConfig.java new file mode 100644 index 0000000000000..8a4a60b52fd38 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetConfig.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hudi.io.storage.HoodieParquetConfig; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +/** + * ParquetConfig for datasource implementation with {@link org.apache.hudi.client.model.HoodieInternalRow}. 
+ */ +public class HoodieRowParquetConfig extends HoodieParquetConfig { + + public HoodieRowParquetConfig(HoodieRowParquetWriteSupport writeSupport, CompressionCodecName compressionCodecName, + int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, + double compressionRatio) { + super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio); + } + + public HoodieRowParquetConfig(HoodieRowParquetWriteSupport writeSupport, CompressionCodecName compressionCodecName, + int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, + double compressionRatio, boolean enableDictionary) { + super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, enableDictionary); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java index 28964ecc3f00c..b437501f42ce4 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java @@ -21,8 +21,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; import org.apache.spark.sql.types.StructType; @@ -34,6 +34,7 @@ import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE; import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER; import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER; +import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_FIELD_ID_WRITE_ENABLED; /** * Hoodie Write Support for directly writing Row to Parquet. 
@@ -46,11 +47,11 @@ public class HoodieRowParquetWriteSupport extends ParquetWriteSupport { private UTF8String minRecordKey; private UTF8String maxRecordKey; - public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, Option bloomFilterOpt, HoodieWriteConfig writeConfig) { + public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, Option bloomFilterOpt, HoodieStorageConfig config) { Configuration hadoopConf = new Configuration(conf); - hadoopConf.set("spark.sql.parquet.writeLegacyFormat", writeConfig.parquetWriteLegacyFormatEnabled()); - hadoopConf.set("spark.sql.parquet.outputTimestampType", writeConfig.parquetOutputTimestampType()); - hadoopConf.set("spark.sql.parquet.fieldId.write.enabled", writeConfig.parquetFieldIdWriteEnabled()); + hadoopConf.set("spark.sql.parquet.writeLegacyFormat", config.getString(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED)); + hadoopConf.set("spark.sql.parquet.outputTimestampType", config.getString(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE)); + hadoopConf.set("spark.sql.parquet.fieldId.write.enabled", config.getString(PARQUET_FIELD_ID_WRITE_ENABLED)); this.hadoopConf = hadoopConf; setSchema(structType, hadoopConf); this.bloomFilter = bloomFilterOpt.orElse(null); @@ -77,6 +78,9 @@ public WriteSupport.FinalizedWriteContext finalizeWrite() { } public void add(UTF8String recordKey) { + if (this.bloomFilter == null) { + return; + } this.bloomFilter.add(recordKey.getBytes()); if (minRecordKey == null || minRecordKey.compareTo(recordKey) < 0) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java index ad71b17ce70ff..60f3d4899d09d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java @@ -278,7 +278,7 @@ private S combinePartitionPathInternal(Supplier> builderFac // Avoid creating [[StringBuilder]] in case there's just one partition-path part, // and Hive-style of partitioning is not required if (!hiveStylePartitioning && partitionPathParts.length == 1) { - return emptyHandler.apply(converter.apply(partitionPathParts[0])); + return encoder.apply(emptyHandler.apply(converter.apply(partitionPathParts[0]))); } StringBuilder sb = builderFactory.get(); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java index 96ac794dcbc82..d7a4a2b52abf7 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.AvroOrcUtils; import org.apache.hudi.common.util.OrcReaderIterator; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; @@ -63,6 +64,10 @@ Schema getAvroSchema(Path sourceFilePath) throws IOException { @Override void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, 
Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception { + // TODO support spark orc reader + if (config.getRecordMerger().getRecordType() == HoodieRecordType.SPARK) { + throw new UnsupportedOperationException(); + } BoundedInMemoryExecutor wrapper = null; Reader orcReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf())); TypeDescription orcSchema = orcReader.getSchema(); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java index 5f45629ba8023..b47a1c765dd4a 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java @@ -20,34 +20,32 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieFileStatus; -import org.apache.hudi.client.bootstrap.BootstrapRecordPayload; -import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.ParquetReaderIterator; +import org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.io.HoodieBootstrapHandle; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.keygen.KeyGeneratorInterface; import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; import java.io.IOException; +import java.util.Collections; +import java.util.Properties; class ParquetBootstrapMetadataHandler extends BaseBootstrapMetadataHandler { private static final Logger LOG = LogManager.getLogger(ParquetBootstrapMetadataHandler.class); @@ -67,18 +65,22 @@ Schema getAvroSchema(Path sourceFilePath) throws IOException { @Override void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception { - BoundedInMemoryExecutor wrapper = null; - ParquetReader reader = - AvroParquetReader.builder(sourceFilePath).withConf(table.getHadoopConf()).build(); + BoundedInMemoryExecutor wrapper = null; + HoodieFileReader reader = HoodieFileReaderFactory.getReaderFactory(table.getConfig().getRecordMerger().getRecordType()) + .getFileReader(table.getHadoopConf(), sourceFilePath); try { - wrapper = new 
BoundedInMemoryExecutor(config.getWriteBufferLimitBytes(), - new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> { - String recKey = keyGenerator.getKey(inp).getRecordKey(); - GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA); - gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey); - BootstrapRecordPayload payload = new BootstrapRecordPayload(gr); - HoodieRecord rec = new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload); - return rec; + wrapper = new BoundedInMemoryExecutor(config.getWriteBufferLimitBytes(), + reader.getRecordIterator(), new BootstrapRecordConsumer(bootstrapHandle), inp -> { + try { + String recKey = inp.getRecordKey(Option.of(keyGenerator)); + HoodieRecord hoodieRecord = inp.rewriteRecord(reader.getSchema(), config.getProps(), HoodieAvroUtils.RECORD_KEY_SCHEMA); + return hoodieRecord + .updateValues(HoodieAvroUtils.RECORD_KEY_SCHEMA, new Properties(), Collections.singletonMap(HoodieMetadataField.RECORD_KEY_METADATA_FIELD.getFieldName(), recKey)) + .newInstance(new HoodieKey(recKey, partitionPath)); + } catch (IOException e) { + LOG.error("Unable to overrideMetadataFieldValue", e); + return null; + } }, table.getPreExecuteRunnable()); wrapper.execute(); } catch (Exception e) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java index 56e8f81405f43..e3524f4d70959 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java @@ -279,7 +279,7 @@ protected Option>> fullBootstrap(Lis properties, context); JavaRDD inputRecordsRDD = (JavaRDD) inputProvider.generateInputRecords("bootstrap_source", config.getBootstrapSourceBasePath(), - partitionFilesList); + partitionFilesList, config); // Start Full Bootstrap String bootstrapInstantTime = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS; final HoodieInstant requested = new HoodieInstant( diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/HoodieSparkRecordUtils.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/HoodieSparkRecordUtils.java new file mode 100644 index 0000000000000..4779e8e05f340 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/HoodieSparkRecordUtils.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
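The rewritten executor above maps each record read from the bootstrap source file to a skeleton record that carries only the Hudi record key, re-keyed with the partition path. A rough sketch of that per-record transform under stand-in types (`SourceRow`, `SkeletonRecord`, and the key-extractor function are hypothetical, not the Hudi API):

```java
import java.util.function.Function;

// Illustrative sketch only: the bootstrap transform keeps just the record key from each
// source row and pairs it with the partition path, dropping the rest of the payload.
class BootstrapSkeletonSketch {

  static class SourceRow {
    final String id;
    final String payload;
    SourceRow(String id, String payload) { this.id = id; this.payload = payload; }
  }

  static class SkeletonRecord {
    final String recordKey;
    final String partitionPath;
    SkeletonRecord(String recordKey, String partitionPath) {
      this.recordKey = recordKey;
      this.partitionPath = partitionPath;
    }
  }

  // keyExtractor plays the role of the key generator wired into the executor above.
  static Function<SourceRow, SkeletonRecord> skeletonTransform(Function<SourceRow, String> keyExtractor,
                                                               String partitionPath) {
    return row -> new SkeletonRecord(keyExtractor.apply(row), partitionPath);
  }
}
```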
+ */ + +package org.apache.hudi.util; + +import org.apache.hudi.HoodieInternalRowUtils; +import org.apache.hudi.commmon.model.HoodieSparkRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.spark.sql.HoodieCatalystExpressionUtils$; +import org.apache.spark.sql.HoodieUnsafeRowUtils; +import org.apache.spark.sql.HoodieUnsafeRowUtils.NestedFieldPath; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; + +public class HoodieSparkRecordUtils { + + /** + * Utility method to convert InternalRow to HoodieRecord using schema and payload class. + */ + public static HoodieRecord convertToHoodieSparkRecord(StructType structType, InternalRow data, boolean withOperationField) { + return convertToHoodieSparkRecord(structType, data, + Pair.of(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.PARTITION_PATH_METADATA_FIELD), + withOperationField, Option.empty()); + } + + public static HoodieRecord convertToHoodieSparkRecord(StructType structType, InternalRow data, boolean withOperationField, + Option partitionName) { + return convertToHoodieSparkRecord(structType, data, + Pair.of(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.PARTITION_PATH_METADATA_FIELD), + withOperationField, partitionName); + } + + /** + * Utility method to convert bytes to HoodieRecord using schema and payload class. + */ + public static HoodieRecord convertToHoodieSparkRecord(StructType structType, InternalRow data, Pair recordKeyPartitionPathFieldPair, + boolean withOperationField, Option partitionName) { + final String recKey = getValue(structType, recordKeyPartitionPathFieldPair.getKey(), data).toString(); + final String partitionPath = (partitionName.isPresent() ? partitionName.get() : + getValue(structType, recordKeyPartitionPathFieldPair.getRight(), data).toString()); + + HoodieOperation operation = withOperationField + ? HoodieOperation.fromName(getNullableValAsString(structType, data, HoodieRecord.OPERATION_METADATA_FIELD)) : null; + return new HoodieSparkRecord(new HoodieKey(recKey, partitionPath), data, structType, operation); + } + + private static Object getValue(StructType structType, String fieldName, InternalRow row) { + NestedFieldPath posList = HoodieInternalRowUtils.getCachedPosList(structType, fieldName); + return HoodieUnsafeRowUtils.getNestedInternalRowValue(row, posList); + } + + /** + * Returns the string value of the given record {@code rec} and field {@code fieldName}. The field and value both could be missing. + * + * @param row The record + * @param fieldName The field name + * @return the string form of the field or empty if the schema does not contain the field name or the value is null + */ + private static Option getNullableValAsString(StructType structType, InternalRow row, String fieldName) { + String fieldVal = !HoodieCatalystExpressionUtils$.MODULE$.existField(structType, fieldName) + ? null : StringUtils.objToString(getValue(structType, fieldName, row)); + return Option.ofNullable(fieldVal); + } + + /** + * Gets record column values into one object. + * + * @param row InternalRow. + * @param columns Names of the columns to get values. + * @param structType {@link StructType} instance. + * @return Column value if a single column, or concatenated String values by comma. 
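Assuming the utility above is on the classpath, a hypothetical usage sketch (field values are made up) showing how an `InternalRow` that already carries the Hudi meta columns becomes a Spark-backed `HoodieRecord`:

```java
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.util.HoodieSparkRecordUtils;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

class ConvertToSparkRecordExample {
  static void example() {
    // Schema with the two Hudi meta columns the converter reads.
    StructType structType = new StructType(new StructField[] {
        DataTypes.createStructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, DataTypes.StringType, true),
        DataTypes.createStructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, DataTypes.StringType, true)
    });
    // Row values must be Catalyst types, hence UTF8String rather than java.lang.String.
    GenericInternalRow row = new GenericInternalRow(new Object[] {
        UTF8String.fromString("key-001"), UTF8String.fromString("2022/08/01")
    });
    HoodieRecord record = HoodieSparkRecordUtils.convertToHoodieSparkRecord(structType, row, false);
    System.out.println(record.getRecordKey() + " / " + record.getPartitionPath());
  }
}
```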
+ */ + public static Object getRecordColumnValues(InternalRow row, + String[] columns, + StructType structType, boolean consistentLogicalTimestampEnabled) { + if (columns.length == 1) { + NestedFieldPath posList = HoodieInternalRowUtils.getCachedPosList(structType, columns[0]); + return HoodieUnsafeRowUtils.getNestedInternalRowValue(row, posList); + } else { + // TODO this is inefficient, instead we can simply return array of Comparable + StringBuilder sb = new StringBuilder(); + for (String col : columns) { + // TODO support consistentLogicalTimestampEnabled + NestedFieldPath posList = HoodieInternalRowUtils.getCachedPosList(structType, col); + sb.append(HoodieUnsafeRowUtils.getNestedInternalRowValue(row, posList)).append(","); + } + return sb.toString(); + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieInternalRowUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieInternalRowUtils.scala similarity index 51% rename from hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieInternalRowUtils.scala rename to hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieInternalRowUtils.scala index 54e4010fe13ba..cd259974d9b28 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieInternalRowUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieInternalRowUtils.scala @@ -16,60 +16,74 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.hudi import java.nio.charset.StandardCharsets -import java.util +import java.util.HashMap import java.util.concurrent.ConcurrentHashMap -import org.apache.avro.Schema -import org.apache.hudi.AvroConversionUtils -import org.apache.hudi.avro.HoodieAvroUtils -import org.apache.hudi.avro.HoodieAvroUtils.{createFullName, fromJavaDate, toJavaDate} +import org.apache.avro.{Schema, SchemaNormalization} +import org.apache.hbase.thirdparty.com.google.common.base.Supplier +import org.apache.hudi.avro.HoodieAvroUtils.{createFullName, toJavaDate} import org.apache.hudi.common.model.HoodieRecord.HoodieMetadataField -import org.apache.hudi.common.util.ValidationUtils import org.apache.hudi.exception.HoodieException import org.apache.spark.sql.HoodieCatalystExpressionUtils -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, JoinedRow, MutableProjection, Projection} +import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, JoinedRow, Projection} +import org.apache.spark.sql.HoodieUnsafeRowUtils.NestedFieldPath +import org.apache.spark.sql.HoodieUnsafeRowUtils +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, GenericArrayData, MapData} +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.types._ import scala.collection.mutable -/** - * Helper class to do common stuff across Spark InternalRow. - * Provides common methods similar to {@link HoodieAvroUtils}. - */ object HoodieInternalRowUtils { - val projectionMap = new ConcurrentHashMap[(StructType, StructType), MutableProjection] + // Projection are all thread local.
Projection is not thread-safe + val unsafeProjectionThreadLocal: ThreadLocal[HashMap[(StructType, StructType), UnsafeProjection]] = + ThreadLocal.withInitial(new Supplier[HashMap[(StructType, StructType), UnsafeProjection]] { + override def get(): HashMap[(StructType, StructType), UnsafeProjection] = new HashMap[(StructType, StructType), UnsafeProjection] + }) + val unsafeConvertThreadLocal: ThreadLocal[HashMap[StructType, UnsafeProjection]] = + ThreadLocal.withInitial(new Supplier[HashMap[StructType, UnsafeProjection]] { + override def get(): HashMap[StructType, UnsafeProjection] = new HashMap[StructType, UnsafeProjection] + }) val schemaMap = new ConcurrentHashMap[Schema, StructType] - val SchemaPosMap = new ConcurrentHashMap[StructType, Map[String, (StructField, Int)]] + val schemaFingerPrintMap = new ConcurrentHashMap[Long, StructType] + val fingerPrintSchemaMap = new ConcurrentHashMap[StructType, Long] + val orderPosListMap = new ConcurrentHashMap[(StructType, String), NestedFieldPath] + /** + * @see org.apache.hudi.avro.HoodieAvroUtils#stitchRecords(org.apache.avro.generic.GenericRecord, org.apache.avro.generic.GenericRecord, org.apache.avro.Schema) + */ def stitchRecords(left: InternalRow, leftSchema: StructType, right: InternalRow, rightSchema: StructType, stitchedSchema: StructType): InternalRow = { val mergeSchema = StructType(leftSchema.fields ++ rightSchema.fields) val row = new JoinedRow(left, right) - val projection = getCachedProjection(mergeSchema, stitchedSchema) + val projection = getCachedUnsafeProjection(mergeSchema, stitchedSchema) projection(row) } + /** + * @see org.apache.hudi.avro.HoodieAvroUtils#rewriteRecord(org.apache.avro.generic.GenericRecord, org.apache.avro.Schema) + */ def rewriteRecord(oldRecord: InternalRow, oldSchema: StructType, newSchema: StructType): InternalRow = { val newRow = new GenericInternalRow(Array.fill(newSchema.fields.length)(null).asInstanceOf[Array[Any]]) - val oldFieldMap = getCachedSchemaPosMap(oldSchema) for ((field, pos) <- newSchema.fields.zipWithIndex) { var oldValue: AnyRef = null - if (oldFieldMap.contains(field.name)) { - val (oldField, oldPos) = oldFieldMap(field.name) + if (HoodieCatalystExpressionUtils.existField(oldSchema, field.name)) { + val oldField = oldSchema(field.name) + val oldPos = oldSchema.fieldIndex(field.name) oldValue = oldRecord.get(oldPos, oldField.dataType) } if (oldValue != null) { field.dataType match { case structType: StructType => - val oldField = oldFieldMap(field.name)._1.asInstanceOf[StructType] - rewriteRecord(oldValue.asInstanceOf[InternalRow], oldField, structType) + val oldType = oldSchema(field.name).dataType.asInstanceOf[StructType] + val newValue = rewriteRecord(oldValue.asInstanceOf[InternalRow], oldType, structType) + newRow.update(pos, newValue) case decimalType: DecimalType => - val oldField = oldFieldMap(field.name)._1.asInstanceOf[DecimalType] - if (decimalType.scale != oldField.scale || decimalType.precision != oldField.precision) { + val oldFieldSchema = oldSchema(field.name).dataType.asInstanceOf[DecimalType] + if (decimalType.scale != oldFieldSchema.scale || decimalType.precision != oldFieldSchema.precision) { newRow.update(pos, Decimal.fromDecimal(oldValue.asInstanceOf[Decimal].toBigDecimal.setScale(newSchema.asInstanceOf[DecimalType].scale)) ) } else { @@ -86,34 +100,44 @@ object HoodieInternalRowUtils { newRow } - def rewriteRecordWithNewSchema(oldRecord: InternalRow, oldSchema: StructType, newSchema: StructType, renameCols: util.Map[String, String]): InternalRow = { - 
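The caches declared above are `ThreadLocal` because a generated `UnsafeProjection` is not safe to share across threads. A small sketch of that caching pattern in plain Java, with a `Function` standing in for the projection:

```java
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

// Sketch of the caching pattern above: the cached object (a Function stand-in for
// UnsafeProjection) is assumed to be non-thread-safe, so every thread keeps its own map.
class ThreadLocalProjectionCache<K> {
  private final ThreadLocal<Map<K, Function<Object[], Object[]>>> cache =
      ThreadLocal.withInitial(HashMap::new);

  Function<Object[], Object[]> getOrCreate(K schemaPair, Function<K, Function<Object[], Object[]>> factory) {
    // computeIfAbsent on a per-thread HashMap: no synchronization needed, and each
    // thread ends up with its own projection instance for a given (from, to) pair.
    return cache.get().computeIfAbsent(schemaPair, factory);
  }
}
```

Compared with the previous shared `ConcurrentHashMap` of `MutableProjection`, this trades a little per-thread memory for the guarantee that a projection instance is never used by two threads at once.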
rewriteRecordWithNewSchema(oldRecord, oldSchema, newSchema, renameCols, new util.LinkedList[String]).asInstanceOf[InternalRow] + /** + * @see org.apache.hudi.avro.HoodieAvroUtils#rewriteRecordWithNewSchema(org.apache.avro.generic.IndexedRecord, org.apache.avro.Schema, java.util.Map) + */ + def rewriteRecordWithNewSchema(oldRecord: InternalRow, oldSchema: StructType, newSchema: StructType, renameCols: java.util.Map[String, String]): InternalRow = { + rewriteRecordWithNewSchema(oldRecord, oldSchema, newSchema, renameCols, new java.util.LinkedList[String]).asInstanceOf[InternalRow] } - private def rewriteRecordWithNewSchema(oldRecord: Any, oldSchema: DataType, newSchema: DataType, renameCols: util.Map[String, String], fieldNames: util.Deque[String]): Any = { + /** + * @see org.apache.hudi.avro.HoodieAvroUtils#rewriteRecordWithNewSchema(java.lang.Object, org.apache.avro.Schema, org.apache.avro.Schema, java.util.Map, java.util.Deque) + */ + private def rewriteRecordWithNewSchema(oldRecord: Any, oldSchema: DataType, newSchema: DataType, renameCols: java.util.Map[String, String], fieldNames: java.util.Deque[String]): Any = { if (oldRecord == null) { null } else { newSchema match { case targetSchema: StructType => - ValidationUtils.checkArgument(oldRecord.isInstanceOf[InternalRow], "cannot rewrite record with different type") + if (!oldRecord.isInstanceOf[InternalRow]) { + throw new IllegalArgumentException("cannot rewrite record with different type") + } val oldRow = oldRecord.asInstanceOf[InternalRow] val helper = mutable.Map[Integer, Any]() - val oldSchemaPos = getCachedSchemaPosMap(oldSchema.asInstanceOf[StructType]) + val oldStrucType = oldSchema.asInstanceOf[StructType] targetSchema.fields.zipWithIndex.foreach { case (field, i) => fieldNames.push(field.name) - if (oldSchemaPos.contains(field.name)) { - val (oldField, oldPos) = oldSchemaPos(field.name) + if (HoodieCatalystExpressionUtils.existField(oldStrucType, field.name)) { + val oldField = oldStrucType(field.name) + val oldPos = oldStrucType.fieldIndex(field.name) helper(i) = rewriteRecordWithNewSchema(oldRow.get(oldPos, oldField.dataType), oldField.dataType, field.dataType, renameCols, fieldNames) } else { val fieldFullName = createFullName(fieldNames) val colNamePartsFromOldSchema = renameCols.getOrDefault(fieldFullName, "").split("\\.") val lastColNameFromOldSchema = colNamePartsFromOldSchema(colNamePartsFromOldSchema.length - 1) // deal with rename - if (!oldSchemaPos.contains(field.name) && oldSchemaPos.contains(lastColNameFromOldSchema)) { + if (!HoodieCatalystExpressionUtils.existField(oldStrucType, field.name) && HoodieCatalystExpressionUtils.existField(oldStrucType, lastColNameFromOldSchema)) { // find rename - val (oldField, oldPos) = oldSchemaPos(lastColNameFromOldSchema) + val oldField = oldStrucType(lastColNameFromOldSchema) + val oldPos = oldStrucType.fieldIndex(lastColNameFromOldSchema) helper(i) = rewriteRecordWithNewSchema(oldRow.get(oldPos, oldField.dataType), oldField.dataType, field.dataType, renameCols, fieldNames) } } @@ -131,7 +155,9 @@ object HoodieInternalRowUtils { newRow case targetSchema: ArrayType => - ValidationUtils.checkArgument(oldRecord.isInstanceOf[ArrayData], "cannot rewrite record with different type") + if (!oldRecord.isInstanceOf[ArrayData]) { + throw new IllegalArgumentException("cannot rewrite record with different type") + } val oldElementType = oldSchema.asInstanceOf[ArrayType].elementType val oldArray = oldRecord.asInstanceOf[ArrayData] val newElementType = targetSchema.elementType @@ -142,7 
+168,9 @@ object HoodieInternalRowUtils { newArray case targetSchema: MapType => - ValidationUtils.checkArgument(oldRecord.isInstanceOf[MapData], "cannot rewrite record with different type") + if (!oldRecord.isInstanceOf[MapData]) { + throw new IllegalArgumentException("cannot rewrite record with different type") + } val oldValueType = oldSchema.asInstanceOf[MapType].valueType val oldKeyType = oldSchema.asInstanceOf[MapType].keyType val oldMap = oldRecord.asInstanceOf[MapData] @@ -161,55 +189,86 @@ object HoodieInternalRowUtils { } } + /** + * @see org.apache.hudi.avro.HoodieAvroUtils#rewriteRecordWithMetadata(org.apache.avro.generic.GenericRecord, org.apache.avro.Schema, java.lang.String) + */ def rewriteRecordWithMetadata(record: InternalRow, oldSchema: StructType, newSchema: StructType, fileName: String): InternalRow = { val newRecord = rewriteRecord(record, oldSchema, newSchema) - newRecord.update(HoodieMetadataField.FILENAME_METADATA_FIELD.ordinal, fileName) + newRecord.update(HoodieMetadataField.FILENAME_METADATA_FIELD.ordinal, CatalystTypeConverters.convertToCatalyst(fileName)) newRecord } + /** + * @see org.apache.hudi.avro.HoodieAvroUtils#rewriteEvolutionRecordWithMetadata(org.apache.avro.generic.GenericRecord, org.apache.avro.Schema, java.lang.String) + */ def rewriteEvolutionRecordWithMetadata(record: InternalRow, oldSchema: StructType, newSchema: StructType, fileName: String): InternalRow = { - val newRecord = rewriteRecordWithNewSchema(record, oldSchema, newSchema, new util.HashMap[String, String]()) - newRecord.update(HoodieMetadataField.FILENAME_METADATA_FIELD.ordinal, fileName) + val newRecord = rewriteRecordWithNewSchema(record, oldSchema, newSchema, new java.util.HashMap[String, String]()) + newRecord.update(HoodieMetadataField.FILENAME_METADATA_FIELD.ordinal, CatalystTypeConverters.convertToCatalyst(fileName)) newRecord } + def getCachedPosList(structType: StructType, field: String): NestedFieldPath = { + val schemaPair = (structType, field) + if (!orderPosListMap.containsKey(schemaPair)) { + val posList = HoodieUnsafeRowUtils.composeNestedFieldPath(structType, field) + orderPosListMap.put(schemaPair, posList) + } + orderPosListMap.get(schemaPair) + } + + def getCachedUnsafeConvert(structType: StructType): UnsafeProjection = { + val map = unsafeConvertThreadLocal.get() + if (!map.containsKey(structType)) { + val projection = UnsafeProjection.create(structType) + map.put(structType, projection) + } + map.get(structType) + } + + def getCachedUnsafeProjection(from: StructType, to: StructType): Projection = { + val schemaPair = (from, to) + val map = unsafeProjectionThreadLocal.get() + if (!map.containsKey(schemaPair)) { + val projection = HoodieCatalystExpressionUtils.generateUnsafeProjection(from, to) + map.put(schemaPair, projection) + } + map.get(schemaPair) + } + def getCachedSchema(schema: Schema): StructType = { - if (!schemaMap.contains(schema)) { - schemaMap.synchronized { - if (!schemaMap.contains(schema)) { - val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) - schemaMap.put(schema, structType) - } - } + if (!schemaMap.containsKey(schema)) { + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + schemaMap.put(schema, structType) } schemaMap.get(schema) } - private def getCachedProjection(from: StructType, to: StructType): Projection = { - val schemaPair = (from, to) - if (!projectionMap.contains(schemaPair)) { - projectionMap.synchronized { - if (!projectionMap.contains(schemaPair)) { - val projection = 
HoodieCatalystExpressionUtils.generateMutableProjection(from, to) - projectionMap.put(schemaPair, projection) - } - } + def getCachedSchemaFromFingerPrint(fingerPrint: Long): StructType = { + if (!schemaFingerPrintMap.containsKey(fingerPrint)) { + throw new IllegalArgumentException("Not exist " + fingerPrint) } - projectionMap.get(schemaPair) + schemaFingerPrintMap.get(fingerPrint) } - def getCachedSchemaPosMap(schema: StructType): Map[String, (StructField, Int)] = { - if (!SchemaPosMap.contains(schema)) { - SchemaPosMap.synchronized { - if (!SchemaPosMap.contains(schema)) { - val fieldMap = schema.fields.zipWithIndex.map { case (field, i) => (field.name, (field, i)) }.toMap - SchemaPosMap.put(schema, fieldMap) - } - } + def getCachedFingerPrintFromSchema(schema: StructType): Long = { + if (!fingerPrintSchemaMap.containsKey(schema)) { + throw new IllegalArgumentException("Not exist " + schema) } - SchemaPosMap.get(schema) + fingerPrintSchemaMap.get(schema) + } + + def addCompressedSchema(schema: StructType): Unit ={ + if (!fingerPrintSchemaMap.containsKey(schema)) { + val fingerPrint = SchemaNormalization.fingerprint64(schema.json.getBytes(StandardCharsets.UTF_8)) + schemaFingerPrintMap.put(fingerPrint, schema) + fingerPrintSchemaMap.put(schema, fingerPrint) + } + } + + def containsCompressedSchema(schema: StructType): Boolean = { + fingerPrintSchemaMap.containsKey(schema) } private def rewritePrimaryType(oldValue: Any, oldSchema: DataType, newSchema: DataType): Any = { @@ -231,35 +290,35 @@ object HoodieInternalRowUtils { val value = newSchema match { case NullType | BooleanType => case DateType if oldSchema.equals(StringType) => - fromJavaDate(java.sql.Date.valueOf(oldValue.toString)) + CatalystTypeConverters.convertToCatalyst(java.sql.Date.valueOf(oldValue.toString)) case LongType => oldSchema match { - case IntegerType => oldValue.asInstanceOf[Int].longValue() + case IntegerType => CatalystTypeConverters.convertToCatalyst(oldValue.asInstanceOf[Int].longValue()) case _ => } case FloatType => oldSchema match { - case IntegerType => oldValue.asInstanceOf[Int].floatValue() - case LongType => oldValue.asInstanceOf[Long].floatValue() + case IntegerType => CatalystTypeConverters.convertToCatalyst(oldValue.asInstanceOf[Int].floatValue()) + case LongType => CatalystTypeConverters.convertToCatalyst(oldValue.asInstanceOf[Long].floatValue()) case _ => } case DoubleType => oldSchema match { - case IntegerType => oldValue.asInstanceOf[Int].doubleValue() - case LongType => oldValue.asInstanceOf[Long].doubleValue() - case FloatType => java.lang.Double.valueOf(oldValue.asInstanceOf[Float] + "") + case IntegerType => CatalystTypeConverters.convertToCatalyst(oldValue.asInstanceOf[Int].doubleValue()) + case LongType => CatalystTypeConverters.convertToCatalyst(oldValue.asInstanceOf[Long].doubleValue()) + case FloatType => CatalystTypeConverters.convertToCatalyst(java.lang.Double.valueOf(oldValue.asInstanceOf[Float] + "")) case _ => } case BinaryType => oldSchema match { - case StringType => oldValue.asInstanceOf[String].getBytes(StandardCharsets.UTF_8) + case StringType => CatalystTypeConverters.convertToCatalyst(oldValue.asInstanceOf[String].getBytes(StandardCharsets.UTF_8)) case _ => } case StringType => oldSchema match { - case BinaryType => new String(oldValue.asInstanceOf[Array[Byte]]) - case DateType => toJavaDate(oldValue.asInstanceOf[Integer]).toString - case IntegerType | LongType | FloatType | DoubleType | DecimalType() => oldValue.toString + case BinaryType => 
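`addCompressedSchema` and the two lookup methods above form a small fingerprint registry: a schema is registered once and can afterwards be referenced by the 64-bit fingerprint of its JSON form. A self-contained sketch of the same idea (the `StructTypeFingerprintRegistry` class is hypothetical; `SchemaNormalization.fingerprint64` is the same Avro helper used above):

```java
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.avro.SchemaNormalization;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// A schema is registered once; afterwards a stable 64-bit fingerprint of its JSON form
// is enough to refer to it (for example on the wire).
class StructTypeFingerprintRegistry {
  private final ConcurrentHashMap<Long, StructType> byFingerprint = new ConcurrentHashMap<>();
  private final ConcurrentHashMap<StructType, Long> bySchema = new ConcurrentHashMap<>();

  long register(StructType schema) {
    return bySchema.computeIfAbsent(schema, s -> {
      long fingerprint = SchemaNormalization.fingerprint64(s.json().getBytes(StandardCharsets.UTF_8));
      byFingerprint.put(fingerprint, s);
      return fingerprint;
    });
  }

  StructType lookup(long fingerprint) {
    StructType schema = byFingerprint.get(fingerprint);
    if (schema == null) {
      throw new IllegalArgumentException("Unknown schema fingerprint: " + fingerprint);
    }
    return schema;
  }

  public static void main(String[] args) {
    StructTypeFingerprintRegistry registry = new StructTypeFingerprintRegistry();
    StructType schema = new StructType().add("id", DataTypes.StringType);
    long fp = registry.register(schema);
    System.out.println(fp + " -> " + registry.lookup(fp).json());
  }
}
```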
CatalystTypeConverters.convertToCatalyst(new String(oldValue.asInstanceOf[Array[Byte]])) + case DateType => CatalystTypeConverters.convertToCatalyst(toJavaDate(oldValue.asInstanceOf[Integer]).toString) + case IntegerType | LongType | FloatType | DoubleType | DecimalType() => CatalystTypeConverters.convertToCatalyst(oldValue.toString) case _ => } case DecimalType() => @@ -275,7 +334,11 @@ object HoodieInternalRowUtils { if (value == None) { throw new HoodieException(String.format("cannot support rewrite value for schema type: %s since the old schema type is: %s", newSchema, oldSchema)) } else { - value + CatalystTypeConverters.convertToCatalyst(value) } } + + def removeFields(schema: StructType, fieldsToRemove: java.util.List[String]): StructType = { + StructType(schema.fields.filter(field => !fieldsToRemove.contains(field.name))) + } } diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkConversionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkConversionUtils.scala new file mode 100644 index 0000000000000..799bda2f8e5ba --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkConversionUtils.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.hudi.common.model.HoodieRecord +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrameUtil, Dataset, Row, SparkSession} + +object SparkConversionUtils { + + def createDataFrame[T](rdd: RDD[HoodieRecord[T]], ss: SparkSession, structType: StructType): Dataset[Row] = { + if (rdd.isEmpty()) { + ss.emptyDataFrame + } else { + DataFrameUtil.createFromInternalRows(ss, structType, rdd.map(_.getData.asInstanceOf[InternalRow])) + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/DataFrameUtil.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/DataFrameUtil.scala new file mode 100644 index 0000000000000..f05da8a0ce85a --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/DataFrameUtil.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.LogicalRDD +import org.apache.spark.sql.types.StructType + +object DataFrameUtil { + + /** + * Creates a DataFrame out of RDD[InternalRow] that you can get + * using `df.queryExection.toRdd` + */ + def createFromInternalRows(sparkSession: SparkSession, schema: + StructType, rdd: RDD[InternalRow]): DataFrame = { + val logicalPlan = LogicalRDD(schema.toAttributes, rdd)(sparkSession) + Dataset.ofRows(sparkSession, logicalPlan) + } +} \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala index c047730a320ed..86f10d08064ec 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala @@ -226,6 +226,10 @@ object HoodieCatalystExpressionUtils { ) } + def existField(structType: StructType, name: String): Boolean = { + structType.getFieldIndex(name).isDefined + } + private def hasUnresolvedRefs(resolvedExpr: Expression): Boolean = resolvedExpr.collectFirst { case _: UnresolvedAttribute | _: UnresolvedFunction => true diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkStructTypeSerializer.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkStructTypeSerializer.scala new file mode 100644 index 0000000000000..e3407d19b65f9 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkStructTypeSerializer.scala @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
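A hypothetical usage sketch for the `DataFrameUtil` helper above, assuming it is on the classpath: pull the raw `RDD[InternalRow]` out of an existing Dataset via `queryExecution().toRdd()` and rebuild a DataFrame from it without re-encoding the rows:

```java
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrameUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.InternalRow;

// Round trip a Dataset through its internal-row representation using the helper above.
class DataFrameUtilExample {
  static Dataset<Row> roundTrip(SparkSession spark, Dataset<Row> df) {
    RDD<InternalRow> internalRows = df.queryExecution().toRdd();
    return DataFrameUtil.createFromInternalRows(spark, df.schema(), internalRows);
  }
}
```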
+ */ + +package org.apache.spark.sql.hudi + +import com.esotericsoftware.kryo.Kryo +import com.esotericsoftware.kryo.io.{Input, Output} +import com.twitter.chill.KSerializer +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} +import java.nio.ByteBuffer +import java.nio.charset.StandardCharsets +import org.apache.avro.SchemaNormalization +import org.apache.commons.io.IOUtils +import org.apache.hudi.commmon.model.HoodieSparkRecord +import org.apache.spark.io.CompressionCodec +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.Utils +import org.apache.spark.{SparkEnv, SparkException} +import scala.collection.mutable + +/** + * Custom serializer used for generic spark records. If the user registers the schemas + * ahead of time, then the schema's fingerprint will be sent with each message instead of the actual + * schema, as to reduce network IO. + * Actions like parsing or compressing schemas are computationally expensive so the serializer + * caches all previously seen values as to reduce the amount of work needed to do. + * @param schemas a map where the keys are unique IDs for spark schemas and the values are the + * string representation of the Avro schema, used to decrease the amount of data + * that needs to be serialized. + */ +class SparkStructTypeSerializer(schemas: Map[Long, StructType]) extends KSerializer[HoodieSparkRecord] { + /** Used to reduce the amount of effort to compress the schema */ + private val compressCache = new mutable.HashMap[StructType, Array[Byte]]() + private val decompressCache = new mutable.HashMap[ByteBuffer, StructType]() + + /** Fingerprinting is very expensive so this alleviates most of the work */ + private val fingerprintCache = new mutable.HashMap[StructType, Long]() + private val schemaCache = new mutable.HashMap[Long, StructType]() + + // GenericAvroSerializer can't take a SparkConf in the constructor b/c then it would become + // a member of KryoSerializer, which would make KryoSerializer not Serializable. We make + // the codec lazy here just b/c in some unit tests, we use a KryoSerializer w/out having + // the SparkEnv set (note those tests would fail if they tried to serialize avro data). + private lazy val codec = CompressionCodec.createCodec(SparkEnv.get.conf) + + /** + * Used to compress Schemas when they are being sent over the wire. + * The compression results are memoized to reduce the compression time since the + * same schema is compressed many times over + */ + def compress(schema: StructType): Array[Byte] = compressCache.getOrElseUpdate(schema, { + val bos = new ByteArrayOutputStream() + val out = codec.compressedOutputStream(bos) + Utils.tryWithSafeFinally { + out.write(schema.json.getBytes(StandardCharsets.UTF_8)) + } { + out.close() + } + bos.toByteArray + }) + + /** + * Decompresses the schema into the actual in-memory object. Keeps an internal cache of already + * seen values so to limit the number of times that decompression has to be done. + */ + def decompress(schemaBytes: ByteBuffer): StructType = decompressCache.getOrElseUpdate(schemaBytes, { + val bis = new ByteArrayInputStream( + schemaBytes.array(), + schemaBytes.arrayOffset() + schemaBytes.position(), + schemaBytes.remaining()) + val in = codec.compressedInputStream(bis) + val bytes = Utils.tryWithSafeFinally { + IOUtils.toByteArray(in) + } { + in.close() + } + StructType.fromString(new String(bytes, StandardCharsets.UTF_8)) + }) + + /** + * Serializes a record to the given output stream. 
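The `compress`/`decompress` pair above memoizes per-schema work so the same `StructType` JSON is only compressed or parsed once. A simplified sketch of that memoization with GZIP standing in for the Spark `CompressionCodec` (the `SchemaCompressionCache` class is hypothetical):

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

// Each distinct schema JSON is compressed once; repeated serializations reuse the bytes.
class SchemaCompressionCache {
  private final Map<String, byte[]> compressCache = new HashMap<>();

  byte[] compress(String schemaJson) {
    return compressCache.computeIfAbsent(schemaJson, json -> {
      try {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (GZIPOutputStream out = new GZIPOutputStream(bos)) {
          out.write(json.getBytes(StandardCharsets.UTF_8));
        }
        return bos.toByteArray();
      } catch (IOException e) {
        throw new RuntimeException("Failed to compress schema", e);
      }
    });
  }

  static String decompress(byte[] compressed) throws IOException {
    try (GZIPInputStream in = new GZIPInputStream(new ByteArrayInputStream(compressed))) {
      ByteArrayOutputStream bos = new ByteArrayOutputStream();
      byte[] buf = new byte[4096];
      int n;
      while ((n = in.read(buf)) != -1) {
        bos.write(buf, 0, n);
      }
      return new String(bos.toByteArray(), StandardCharsets.UTF_8);
    }
  }
}
```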
It caches a lot of the internal data as + * to not redo work + */ + def serializeDatum(datum: HoodieSparkRecord, output: Output): Unit = { + val schema = datum.getStructType + val fingerprint = fingerprintCache.getOrElseUpdate(schema, { + SchemaNormalization.fingerprint64(schema.json.getBytes(StandardCharsets.UTF_8)) + }) + schemas.get(fingerprint) match { + case Some(_) => + output.writeBoolean(true) + output.writeLong(fingerprint) + case None => + output.writeBoolean(false) + val compressedSchema = compress(schema) + output.writeInt(compressedSchema.length) + output.writeBytes(compressedSchema) + } + + val record = datum.newInstance().asInstanceOf[HoodieSparkRecord] + record.setStructType(null) + val stream = new ObjectOutputStream(output) + stream.writeObject(record) + stream.close() + } + + /** + * Deserializes generic records into their in-memory form. There is internal + * state to keep a cache of already seen schemas and datum readers. + */ + def deserializeDatum(input: Input): HoodieSparkRecord = { + val schema = { + if (input.readBoolean()) { + val fingerprint = input.readLong() + schemaCache.getOrElseUpdate(fingerprint, { + schemas.get(fingerprint) match { + case Some(s) => s + case None => + throw new SparkException( + "Error reading attempting to read spark data -- encountered an unknown " + + s"fingerprint: $fingerprint, not sure what schema to use. This could happen " + + "if you registered additional schemas after starting your spark context.") + } + }) + } else { + val length = input.readInt() + decompress(ByteBuffer.wrap(input.readBytes(length))) + } + } + val stream = new ObjectInputStream(input) + val record = stream.readObject().asInstanceOf[HoodieSparkRecord] + stream.close() + record.setStructType(schema) + + record + } + + override def write(kryo: Kryo, output: Output, datum: HoodieSparkRecord): Unit = + serializeDatum(datum, output) + + override def read(kryo: Kryo, input: Input, datumClass: Class[HoodieSparkRecord]): HoodieSparkRecord = + deserializeDatum(input) +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkConsistentBucketClusteringPlanStrategy.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkConsistentBucketClusteringPlanStrategy.java index 810dbc5bc75cc..694fc77205f8a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkConsistentBucketClusteringPlanStrategy.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/clustering/plan/strategy/TestSparkConsistentBucketClusteringPlanStrategy.java @@ -29,7 +29,7 @@ import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.collection.Triple; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.bucket.ConsistentBucketIdentifier; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestConsistentBucketIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestConsistentBucketIndex.java index d970d593d0fe8..a58c2ce8b5761 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestConsistentBucketIndex.java +++ 
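`serializeDatum`/`deserializeDatum` above frame the schema on the wire as either a known fingerprint or the compressed schema bytes. A simplified sketch of that framing using plain `java.io` instead of Kryo's `Output` (the `SchemaFramingSketch` class is hypothetical):

```java
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Set;

// Schemas the receiver already knows travel as an 8-byte fingerprint;
// unknown schemas travel as length-prefixed compressed bytes.
class SchemaFramingSketch {
  static byte[] frame(long fingerprint, Set<Long> registeredFingerprints, byte[] compressedSchema)
      throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    try (DataOutputStream out = new DataOutputStream(bos)) {
      if (registeredFingerprints.contains(fingerprint)) {
        out.writeBoolean(true);   // schema registered ahead of time: send only its fingerprint
        out.writeLong(fingerprint);
      } else {
        out.writeBoolean(false);  // otherwise ship the (compressed) schema itself
        out.writeInt(compressedSchema.length);
        out.write(compressedSchema);
      }
    }
    return bos.toByteArray();
  }
}
```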
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestConsistentBucketIndex.java @@ -33,7 +33,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hadoop.HoodieParquetInputFormat; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java index e3a90becdd400..db8afa2ce1721 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java @@ -41,10 +41,10 @@ import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieFileGroupId; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; @@ -80,7 +80,7 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieLockConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.index.HoodieIndex; @@ -358,7 +358,7 @@ public void testTurnOffMetadataTableAfterEnable() throws Exception { assertTrue(metadataWriter.isPresent()); HoodieTableConfig hoodieTableConfig = - new HoodieTableConfig(this.fs, metaClient.getMetaPath(), writeConfig.getPayloadClass()); + new HoodieTableConfig(this.fs, metaClient.getMetaPath(), writeConfig.getPayloadClass(), writeConfig.getStringOrDefault(HoodieWriteConfig.MERGER_IMPLS)); assertFalse(hoodieTableConfig.getMetadataPartitions().isEmpty()); // Turn off metadata table @@ -375,7 +375,7 @@ public void testTurnOffMetadataTableAfterEnable() throws Exception { assertFalse(metadataWriter2.isPresent()); HoodieTableConfig hoodieTableConfig2 = - new HoodieTableConfig(this.fs, metaClient.getMetaPath(), writeConfig2.getPayloadClass()); + new HoodieTableConfig(this.fs, metaClient.getMetaPath(), writeConfig2.getPayloadClass(), writeConfig.getStringOrDefault(HoodieWriteConfig.MERGER_IMPLS)); assertEquals(Collections.emptySet(), hoodieTableConfig2.getMetadataPartitions()); // Assert metadata table folder is deleted assertFalse(metaClient.getFs().exists( @@ -397,7 +397,7 @@ public void testTurnOffMetadataTableAfterEnable() throws Exception { validateMetadata(testTable, true); assertTrue(metadataWriter3.isPresent()); HoodieTableConfig hoodieTableConfig3 = - new HoodieTableConfig(this.fs, metaClient.getMetaPath(), writeConfig.getPayloadClass()); + new HoodieTableConfig(this.fs, metaClient.getMetaPath(), 
writeConfig.getPayloadClass(), writeConfig.getStringOrDefault(HoodieWriteConfig.MERGER_IMPLS)); assertFalse(hoodieTableConfig3.getMetadataPartitions().isEmpty()); } @@ -1012,7 +1012,7 @@ private void verifyMetadataRawRecords(HoodieTable table, List log while (logFileReader.hasNext()) { HoodieLogBlock logBlock = logFileReader.next(); if (logBlock instanceof HoodieDataBlock) { - try (ClosableIterator recordItr = ((HoodieDataBlock) logBlock).getRecordIterator(HoodieAvroIndexedRecord::new)) { + try (ClosableIterator> recordItr = ((HoodieDataBlock) logBlock).getRecordIterator(HoodieRecordType.AVRO)) { recordItr.forEachRemaining(indexRecord -> { final GenericRecord record = (GenericRecord) indexRecord.getData(); if (enableMetaFields) { @@ -2540,7 +2540,7 @@ private void verifyMetadataColumnStatsRecords(List logFiles) thro while (logFileReader.hasNext()) { HoodieLogBlock logBlock = logFileReader.next(); if (logBlock instanceof HoodieDataBlock) { - try (ClosableIterator recordItr = ((HoodieDataBlock) logBlock).getRecordIterator(HoodieAvroIndexedRecord::new)) { + try (ClosableIterator> recordItr = ((HoodieDataBlock) logBlock).getRecordIterator(HoodieRecordType.AVRO)) { recordItr.forEachRemaining(indexRecord -> { final GenericRecord record = (GenericRecord) indexRecord.getData(); final GenericRecord colStatsRecord = (GenericRecord) record.get(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java index 432aa83d6a3d5..5efed8a461136 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java @@ -23,9 +23,9 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; @@ -353,7 +353,7 @@ private void verifyMetadataRawRecords(HoodieTable table, List log while (logFileReader.hasNext()) { HoodieLogBlock logBlock = logFileReader.next(); if (logBlock instanceof HoodieDataBlock) { - try (ClosableIterator recordItr = ((HoodieDataBlock) logBlock).getRecordIterator(HoodieAvroIndexedRecord::new)) { + try (ClosableIterator> recordItr = ((HoodieDataBlock) logBlock).getRecordIterator(HoodieRecordType.AVRO)) { recordItr.forEachRemaining(indexRecord -> { final GenericRecord record = (GenericRecord) indexRecord.getData(); assertNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java index eb950a9a77af3..d342f92011af7 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java +++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java @@ -31,6 +31,7 @@ import org.apache.hudi.client.validator.SparkPreCommitValidator; import org.apache.hudi.client.validator.SqlQueryEqualityPreCommitValidator; import org.apache.hudi.client.validator.SqlQuerySingleResultPreCommitValidator; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -38,14 +39,14 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieAvroRecord; -import org.apache.hudi.common.model.HoodieAvroRecordMerge; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.IOType; @@ -69,6 +70,7 @@ import org.apache.hudi.common.util.ClusteringUtils; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.MarkerUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; @@ -79,7 +81,6 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodiePreCommitValidatorConfig; -import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieCommitException; @@ -443,6 +444,11 @@ public void testDeduplicationOnUpsert(boolean populateMetaFields) throws Excepti */ private void testDeduplication( Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY) + .combineInput(true, true); + addConfigsForPopulateMetaFields(configBuilder, populateMetaFields); + HoodieWriteConfig writeConfig = configBuilder.build(); + String newCommitTime = "001"; String recordKey = UUID.randomUUID().toString(); @@ -464,8 +470,8 @@ private void testDeduplication( // Global dedup should be done based on recordKey only HoodieIndex index = mock(HoodieIndex.class); when(index.isGlobal()).thenReturn(true); - HoodieMerge merge = new HoodieAvroRecordMerge(); - List> dedupedRecs = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, 1, merge).collectAsList(); + HoodieRecordMerger recordMerger = HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName()); + List> dedupedRecs = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, 1, recordMerger, writeConfig.getProps()).collectAsList(); assertEquals(1, dedupedRecs.size()); assertEquals(dedupedRecs.get(0).getPartitionPath(), recordThree.getPartitionPath()); 
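The assertions above, together with the non-global case that follows, come down to how records are keyed during deduplication: by record key alone for a global index, by (record key, partition path) otherwise. A stripped-down sketch of that keying, ignoring the record merger that decides which duplicate survives (`DedupKeyingSketch` is hypothetical; `String[]` pairs stand in for HoodieRecords):

```java
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Global index: duplicates collapse across partitions. Non-global: only within a partition.
class DedupKeyingSketch {
  static List<String[]> dedup(List<String[]> records, boolean globalIndex) {
    Map<String, String[]> byKey = new LinkedHashMap<>();
    for (String[] rec : records) {
      String key = globalIndex ? rec[0] : rec[0] + "|" + rec[1];
      byKey.put(key, rec); // last one wins in this sketch; Hudi delegates this to the merger
    }
    return new ArrayList<>(byKey.values());
  }
}
```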
assertNodupesWithinPartition(dedupedRecs); @@ -473,17 +479,14 @@ private void testDeduplication( // non-Global dedup should be done based on both recordKey and partitionPath index = mock(HoodieIndex.class); when(index.isGlobal()).thenReturn(false); - dedupedRecs = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, 1, merge).collectAsList(); + dedupedRecs = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, 1, recordMerger, writeConfig.getProps()).collectAsList(); assertEquals(2, dedupedRecs.size()); assertNodupesWithinPartition(dedupedRecs); // Perform write-action and check JavaRDD recordList = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1); - HoodieWriteConfig.Builder configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY) - .combineInput(true, true); - addConfigsForPopulateMetaFields(configBuilder, populateMetaFields); - try (SparkRDDWriteClient client = getHoodieWriteClient(configBuilder.build());) { + try (SparkRDDWriteClient client = getHoodieWriteClient(writeConfig);) { client.startCommitWithTime(newCommitTime); List statuses = writeFn.apply(client, recordList, newCommitTime).collect(); assertNoWriteErrors(statuses); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java index 8c92f8189f762..d6d5b2495a32b 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java @@ -38,7 +38,7 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieLayoutConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex.IndexType; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java index 29c653daee61a..6bb95028523b1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java @@ -21,6 +21,7 @@ import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.client.HoodieTimelineArchiver; import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -39,7 +40,6 @@ import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieArchivalConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.metrics.HoodieMetricsConfig; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestSparkConsistentBucketClustering.java 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestSparkConsistentBucketClustering.java index 6763cb154f614..9dcd1d876a0e1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestSparkConsistentBucketClustering.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestSparkConsistentBucketClustering.java @@ -36,7 +36,7 @@ import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseQPSResourceAllocator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseQPSResourceAllocator.java index be80e9d378331..a60ecee1a96ae 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseQPSResourceAllocator.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseQPSResourceAllocator.java @@ -23,7 +23,7 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieHBaseIndexConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java index 407fb8de0e812..43c45d00d0955 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java @@ -37,7 +37,7 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieHBaseIndexConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieSparkTable; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java index e02e613642c1a..2151b13284d26 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java @@ -33,7 +33,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndexUtils; import org.apache.hudi.keygen.BaseKeyGenerator; diff --git 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java index 72749160e6bd0..05a4de483c11f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java @@ -33,7 +33,7 @@ import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.testutils.HoodieClientTestHarness; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileWriterFactory.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileWriterFactory.java index d9377fc4e85c2..7789254bc79eb 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileWriterFactory.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileWriterFactory.java @@ -20,6 +20,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hudi.client.SparkTaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieSparkTable; @@ -46,26 +47,26 @@ public void testGetFileWriter() throws IOException { HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); SparkTaskContextSupplier supplier = new SparkTaskContextSupplier(); HoodieFileWriter parquetWriter = HoodieFileWriterFactory.getFileWriter(instantTime, - parquetPath, table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier); + parquetPath, table.getHadoopConf(), cfg.getStorageConfig(), HoodieTestDataGenerator.AVRO_SCHEMA, supplier, HoodieRecordType.AVRO); assertTrue(parquetWriter instanceof HoodieAvroParquetWriter); // hfile format. final Path hfilePath = new Path(basePath + "/partition/path/f1_1-0-1_000.hfile"); HoodieFileWriter hfileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, - hfilePath, table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier); + hfilePath, table.getHadoopConf(), cfg.getStorageConfig(), HoodieTestDataGenerator.AVRO_SCHEMA, supplier, HoodieRecordType.AVRO); assertTrue(hfileWriter instanceof HoodieAvroHFileWriter); // orc file format. final Path orcPath = new Path(basePath + "/partition/path/f1_1-0-1_000.orc"); HoodieFileWriter orcFileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, - orcPath, table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier); + orcPath, table.getHadoopConf(), cfg.getStorageConfig(), HoodieTestDataGenerator.AVRO_SCHEMA, supplier, HoodieRecordType.AVRO); assertTrue(orcFileWriter instanceof HoodieAvroOrcWriter); // other file format exception. 
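// Illustrative only, not part of the patch: with this change the writer factory no longer takes
// the HoodieTable/HoodieWriteConfig pair, but the raw Hadoop configuration, the (relocated)
// HoodieStorageConfig, the Avro schema, the task-context supplier and the desired record type.
// A call for the parquet case above reads roughly:
//
//   HoodieFileWriter parquetWriter = HoodieFileWriterFactory.getFileWriter(
//       instantTime,
//       parquetPath,                          // writer implementation is picked from the file extension
//       table.getHadoopConf(),
//       cfg.getStorageConfig(),
//       HoodieTestDataGenerator.AVRO_SCHEMA,
//       supplier,
//       HoodieRecordType.AVRO);
//
// Unsupported extensions (the .log case below) still surface as UnsupportedOperationException.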
final Path logPath = new Path(basePath + "/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1"); final Throwable thrown = assertThrows(UnsupportedOperationException.class, () -> { HoodieFileWriter logWriter = HoodieFileWriterFactory.getFileWriter(instantTime, logPath, - table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier); + table.getHadoopConf(), cfg.getStorageConfig(), HoodieTestDataGenerator.AVRO_SCHEMA, supplier, HoodieRecordType.AVRO); }, "should fail since log storage writer is not supported yet."); assertTrue(thrown.getMessage().contains("format not supported yet.")); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java index 481cda00d6e72..6d43782ee34bb 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java @@ -20,9 +20,9 @@ import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.io.storage.HoodieParquetConfig; import org.apache.hudi.testutils.HoodieClientTestHarness; @@ -116,6 +116,6 @@ private HoodieRowParquetWriteSupport getWriteSupport(HoodieWriteConfig.Builder w writeConfig.getBloomFilterFPP(), writeConfig.getDynamicBloomFilterMaxNumEntries(), writeConfig.getBloomFilterType()); - return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, Option.of(filter), writeConfig); + return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, Option.of(filter), writeConfig.getStorageConfig()); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java index f165b48db05d1..a186070e0016a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java @@ -40,7 +40,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieLayoutConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.hadoop.HoodieParquetInputFormat; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java index 3039eb3bd9b5f..9d3fcce2d68e1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java +++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java @@ -38,7 +38,7 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieHBaseIndexConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java index a571a6f4732ea..b99abcadea0de 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java @@ -42,7 +42,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java index 59174a9371a58..c6cd554e28963 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java @@ -37,7 +37,7 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieMemoryConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.index.HoodieIndex; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java index 1c4de34e5ee38..707cf3da504c7 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java @@ -39,7 +39,7 @@ import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java index 5438fbcfc0d98..1b5d29094dc59 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java @@ -32,7 +32,7 @@ import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieSparkTable; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java index f959a8f0d9526..5c9021a5b029c 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java @@ -33,7 +33,7 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieLayoutConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.action.commit.SparkBucketIndexPartitioner; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java index 0a11425ec5b89..a5a5c31de2bce 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java @@ -46,7 +46,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.index.HoodieIndex; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java index 900674a677588..204654dc5de2f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java @@ -20,6 +20,7 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.HoodieCleanStat; import 
org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; @@ -41,7 +42,6 @@ import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex.IndexType; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java index cb7b2e6b3c43a..c452f413bc050 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java @@ -47,7 +47,7 @@ import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieIOException; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java index 918462ac0a08f..09e6bd699bce1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java @@ -23,7 +23,7 @@ import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index 1c638c5d63543..2af3f8f3a11b7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -19,10 +19,12 @@ package org.apache.hudi.avro; import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; @@ -113,7 +115,7 @@ public class HoodieAvroUtils { * TODO serialize other type of record. 
*/ public static Option recordToBytes(HoodieRecord record, Schema schema) throws IOException { - return Option.of(HoodieAvroUtils.indexedRecordToBytes((IndexedRecord) record.toIndexedRecord(schema, new Properties()).get())); + return Option.of(HoodieAvroUtils.indexedRecordToBytes(((HoodieAvroIndexedRecord) record.toIndexedRecord(schema, new Properties()).get()).getData())); } /** @@ -434,14 +436,18 @@ public static GenericRecord removeFields(GenericRecord record, Set field private static void copyOldValueOrSetDefault(GenericRecord oldRecord, GenericRecord newRecord, Schema.Field field) { Schema oldSchema = oldRecord.getSchema(); - Object fieldValue = oldSchema.getField(field.name()) == null ? null : oldRecord.get(field.name()); + Field oldSchemaField = oldSchema.getField(field.name()); + Object fieldValue = oldSchemaField == null ? null : oldRecord.get(field.name()); if (fieldValue != null) { // In case field's value is a nested record, we have to rewrite it as well Object newFieldValue; if (fieldValue instanceof GenericRecord) { GenericRecord record = (GenericRecord) fieldValue; - newFieldValue = rewriteRecord(record, resolveUnionSchema(field.schema(), record.getSchema().getFullName())); + // May return null when use rewrite + String recordFullName = record.getSchema().getFullName(); + String fullName = recordFullName != null ? recordFullName : oldSchemaField.name(); + newFieldValue = rewriteRecord(record, resolveUnionSchema(field.schema(), fullName)); } else { newFieldValue = fieldValue; } @@ -696,7 +702,7 @@ public static Object getRecordColumnValues(HoodieAvroRecord record, String[] columns, Schema schema, boolean consistentLogicalTimestampEnabled) { try { - GenericRecord genericRecord = (GenericRecord) record.toIndexedRecord(schema, new Properties()).get(); + GenericRecord genericRecord = (GenericRecord) ((HoodieAvroIndexedRecord) record.toIndexedRecord(schema, new Properties()).get()).getData(); if (columns.length == 1) { return HoodieAvroUtils.getNestedFieldVal(genericRecord, columns[0], true, consistentLogicalTimestampEnabled); } else { @@ -1033,6 +1039,28 @@ public GenericRecord next() { }; } + public static HoodieRecord createHoodieRecordFromAvro( + IndexedRecord data, + String payloadClass, + String preCombineField, + Option> simpleKeyGenFieldsOpt, + Boolean withOperation, + Option partitionNameOp, + Boolean populateMetaFields) { + if (populateMetaFields) { + return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) data, + payloadClass, preCombineField, withOperation); + // Support HoodieFileSliceReader + } else if (simpleKeyGenFieldsOpt.isPresent()) { + // TODO in HoodieFileSliceReader may partitionName=option#empty + return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) data, + payloadClass, preCombineField, simpleKeyGenFieldsOpt.get(), withOperation, partitionNameOp); + } else { + return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) data, + payloadClass, preCombineField, withOperation, partitionNameOp); + } + } + public static GenericRecord rewriteRecordDeep(GenericRecord oldRecord, Schema newSchema) { return rewriteRecordWithNewSchema(oldRecord, newSchema, Collections.EMPTY_MAP); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java index 6d4236b048d2d..8aa50faaab591 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java +++ 
b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java @@ -19,10 +19,12 @@ package org.apache.hudi.common.config; import org.apache.hadoop.fs.FSDataInputStream; + import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.exception.HoodieException; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -143,6 +145,14 @@ public List getSplitStrings(ConfigProperty configProperty, String return StringUtils.split(getString(configProperty), delimiter); } + public List getSplitStringsOrDefault(ConfigProperty configProperty) { + return getSplitStringsOrDefault(configProperty, ","); + } + + public List getSplitStringsOrDefault(ConfigProperty configProperty, String delimiter) { + return StringUtils.split(getStringOrDefault(configProperty), delimiter); + } + public String getString(String key) { return props.getProperty(key); } @@ -155,7 +165,7 @@ public Integer getInt(ConfigProperty configProperty) { public Integer getIntOrDefault(ConfigProperty configProperty) { Option rawValue = getRawValue(configProperty); return rawValue.map(v -> Integer.parseInt(v.toString())) - .orElse((Integer) configProperty.defaultValue()); + .orElse(Integer.parseInt(configProperty.defaultValue().toString())); } public Boolean getBoolean(ConfigProperty configProperty) { @@ -182,16 +192,34 @@ public Long getLong(ConfigProperty configProperty) { return rawValue.map(v -> Long.parseLong(v.toString())).orElse(null); } + public Long getLongOrDefault(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Long.parseLong(v.toString())) + .orElseGet(() -> Long.parseLong(configProperty.defaultValue().toString())); + } + public Float getFloat(ConfigProperty configProperty) { Option rawValue = getRawValue(configProperty); return rawValue.map(v -> Float.parseFloat(v.toString())).orElse(null); } + public Float getFloatOrDefault(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Float.parseFloat(v.toString())) + .orElseGet(() -> Float.parseFloat(configProperty.defaultValue().toString())); + } + public Double getDouble(ConfigProperty configProperty) { Option rawValue = getRawValue(configProperty); return rawValue.map(v -> Double.parseDouble(v.toString())).orElse(null); } + public Double getDoubleOrDefault(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Double.parseDouble(v.toString())) + .orElseGet(() -> Double.parseDouble(configProperty.defaultValue().toString())); + } + public String getStringOrDefault(ConfigProperty configProperty) { return getStringOrDefault(configProperty, configProperty.defaultValue().toString()); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java similarity index 98% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java rename to hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java index 40c53fae9686b..89841ed2fafa1 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java @@ -16,12 +16,7 @@ * limitations under the License. 
*/ -package org.apache.hudi.config; - -import org.apache.hudi.common.config.ConfigClassProperty; -import org.apache.hudi.common.config.ConfigGroups; -import org.apache.hudi.common.config.ConfigProperty; -import org.apache.hudi.common.config.HoodieConfig; +package org.apache.hudi.common.config; import javax.annotation.concurrent.Immutable; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroIndexedRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroIndexedRecord.java index daec2fee0338f..28265bbbf41bb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroIndexedRecord.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroIndexedRecord.java @@ -19,21 +19,22 @@ package org.apache.hudi.common.model; import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; import java.util.Map; import java.util.Properties; +import static org.apache.hudi.common.table.HoodieTableConfig.POPULATE_META_FIELDS; + /** * This only use by reader returning. */ @@ -48,7 +49,7 @@ public HoodieAvroIndexedRecord(HoodieKey key, IndexedRecord data) { } public HoodieAvroIndexedRecord(HoodieKey key, IndexedRecord data, HoodieOperation operation) { - super(key, data, operation, null); + super(key, data, operation); } public HoodieAvroIndexedRecord(HoodieRecord record) { @@ -58,28 +59,19 @@ public HoodieAvroIndexedRecord(HoodieRecord record) { public HoodieAvroIndexedRecord() { } - @Override - public Option toIndexedRecord(Schema schema, Properties prop) { - return Option.of(data); - } - - public Option toIndexedRecord() { - return Option.of(data); - } - @Override public HoodieRecord newInstance() { - throw new UnsupportedOperationException(); + return new HoodieAvroIndexedRecord(this); } @Override public HoodieRecord newInstance(HoodieKey key, HoodieOperation op) { - throw new UnsupportedOperationException(); + return new HoodieAvroIndexedRecord(key, data, op); } @Override public HoodieRecord newInstance(HoodieKey key) { - throw new UnsupportedOperationException(); + return new HoodieAvroIndexedRecord(key, data); } @Override @@ -87,6 +79,11 @@ public String getRecordKey(Option keyGeneratorOpt) { return keyGeneratorOpt.isPresent() ? 
keyGeneratorOpt.get().getRecordKey((GenericRecord) data) : ((GenericRecord) data).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); } + @Override + public HoodieRecordType getRecordType() { + return HoodieRecordType.AVRO; + } + @Override public String getRecordKey(String keyFieldName) { return Option.ofNullable(data.getSchema().getField(keyFieldName)) @@ -95,74 +92,80 @@ public String getRecordKey(String keyFieldName) { } @Override - public HoodieRecord mergeWith(HoodieRecord other, Schema readerSchema, Schema writerSchema) throws IOException { - ValidationUtils.checkState(other instanceof HoodieAvroIndexedRecord); - GenericRecord record = HoodieAvroUtils.stitchRecords((GenericRecord) data, (GenericRecord) other.getData(), writerSchema); - return new HoodieAvroIndexedRecord(record); + public Object getRecordColumnValues(Schema recordSchema, String[] columns, boolean consistentLogicalTimestampEnabled) { + throw new UnsupportedOperationException(); } @Override - public HoodieRecord rewriteRecord(Schema recordSchema, Schema targetSchema, TypedProperties props) throws IOException { - GenericRecord avroPayloadInNewSchema = - HoodieAvroUtils.rewriteRecord((GenericRecord) data, targetSchema); - return new HoodieAvroIndexedRecord(avroPayloadInNewSchema); + public HoodieRecord mergeWith(HoodieRecord other, Schema targetSchema) throws IOException { + GenericRecord record = HoodieAvroUtils.stitchRecords((GenericRecord) data, (GenericRecord) other.getData(), targetSchema); + return new HoodieAvroIndexedRecord(record); } @Override - public HoodieRecord rewriteRecord(Schema recordSchema, Properties prop, boolean schemaOnReadEnabled, Schema writeSchemaWithMetaFields) throws IOException { - GenericRecord rewriteRecord = schemaOnReadEnabled ? HoodieAvroUtils.rewriteRecordWithNewSchema(data, writeSchemaWithMetaFields, new HashMap<>()) - : HoodieAvroUtils.rewriteRecord((GenericRecord) data, writeSchemaWithMetaFields); - return new HoodieAvroIndexedRecord(rewriteRecord); + public HoodieRecord rewriteRecord(Schema recordSchema, Properties props, Schema targetSchema) throws IOException { + GenericRecord genericRecord = HoodieAvroUtils.rewriteRecord((GenericRecord) data, targetSchema); + return new HoodieAvroIndexedRecord(genericRecord); } @Override - public HoodieRecord rewriteRecordWithMetadata(Schema recordSchema, Properties prop, boolean schemaOnReadEnabled, Schema writeSchemaWithMetaFields, String fileName) throws IOException { - GenericRecord rewriteRecord = schemaOnReadEnabled ? 
HoodieAvroUtils.rewriteEvolutionRecordWithMetadata((GenericRecord) data, writeSchemaWithMetaFields, fileName) - : HoodieAvroUtils.rewriteRecordWithMetadata((GenericRecord) data, writeSchemaWithMetaFields, fileName); - return new HoodieAvroIndexedRecord(rewriteRecord); + public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties props, Schema newSchema, Map renameCols) throws IOException { + GenericRecord genericRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(data, newSchema, renameCols); + return new HoodieAvroIndexedRecord(genericRecord); } @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema, Map renameCols) throws IOException { - GenericRecord rewriteRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(data, newSchema, renameCols); - return new HoodieAvroIndexedRecord(rewriteRecord); - } + public HoodieRecord updateValues(Schema recordSchema, Properties props, Map metadataValues) throws IOException { + metadataValues.forEach((key, value) -> { + if (value != null) { + ((GenericRecord) data).put(key, value); + } + }); - @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema, Map renameCols, Mapper mapper) throws IOException { - GenericRecord oldRecord = (GenericRecord) getData(); - GenericRecord rewriteRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(oldRecord, newSchema, renameCols); - return mapper.apply(rewriteRecord); + return new HoodieAvroIndexedRecord(data); } @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema) throws IOException { - GenericRecord oldRecord = (GenericRecord) data; - GenericRecord rewriteRecord = HoodieAvroUtils.rewriteRecord(oldRecord, newSchema); - return new HoodieAvroIndexedRecord(rewriteRecord); + public boolean isDelete(Schema schema, Properties props) { + return false; } @Override - public HoodieRecord addMetadataValues(Schema recordSchema, Properties prop, Map metadataValues) throws IOException { - Arrays.stream(HoodieMetadataField.values()).forEach(metadataField -> { - String value = metadataValues.get(metadataField); - if (value != null) { - ((GenericRecord) data).put(metadataField.getFieldName(), value); - } - }); - - return new HoodieAvroIndexedRecord(data); + public boolean shouldIgnore(Schema schema, Properties props) throws IOException { + return getData().equals(SENTINEL); } @Override - public HoodieRecord overrideMetadataFieldValue(Schema recordSchema, Properties prop, int pos, String newValue) throws IOException { - data.put(pos, newValue); - return this; + public HoodieRecord wrapIntoHoodieRecordPayloadWithParams( + Schema schema, + Properties props, + Option> simpleKeyGenFieldsOpt, + Boolean withOperation, + Option partitionNameOp, + Boolean populateMetaFields) { + String payloadClass = ConfigUtils.getPayloadClass(props); + String preCombineField = ConfigUtils.getOrderingField(props); + return HoodieAvroUtils.createHoodieRecordFromAvro(data, payloadClass, preCombineField, simpleKeyGenFieldsOpt, withOperation, partitionNameOp, populateMetaFields); } @Override - public boolean shouldIgnore(Schema schema, Properties prop) throws IOException { - return getData().equals(SENTINEL); + public HoodieRecord wrapIntoHoodieRecordPayloadWithKeyGen(Properties props, Option keyGen) { + GenericRecord record = (GenericRecord) data; + String key; + String partition; + if (keyGen.isPresent() && 
!Boolean.parseBoolean(props.getOrDefault(POPULATE_META_FIELDS.key(), POPULATE_META_FIELDS.defaultValue().toString()).toString())) { + BaseKeyGenerator keyGeneratorOpt = keyGen.get(); + key = keyGeneratorOpt.getRecordKey(record); + partition = keyGeneratorOpt.getPartitionPath(record); + } else { + key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + } + HoodieKey hoodieKey = new HoodieKey(key, partition); + + HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); + return hoodieRecord; } @Override @@ -171,7 +174,17 @@ public Option> getMetadata() { } @Override - public boolean isPresent(Schema schema, Properties prop) { - return true; + public Comparable getOrderingValue(Properties props) { + boolean consistentLogicalTimestampEnabled = Boolean.parseBoolean(props.getProperty( + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())); + return (Comparable) HoodieAvroUtils.getNestedFieldVal((GenericRecord) data, + ConfigUtils.getOrderingField(props), + true, consistentLogicalTimestampEnabled); + } + + @Override + public Option toIndexedRecord(Schema schema, Properties props) { + return Option.of(this); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java index 7bc6af89f8bfd..65f15ca6a4629 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java @@ -20,33 +20,27 @@ package org.apache.hudi.common.model; import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.util.HoodieRecordUtils; +import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; -import javax.annotation.Nonnull; - import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; import java.util.Map; import java.util.Properties; -import static org.apache.hudi.common.util.TypeUtils.unsafeCast; - public class HoodieAvroRecord extends HoodieRecord { + public HoodieAvroRecord(HoodieKey key, T data) { super(key, data); } public HoodieAvroRecord(HoodieKey key, T data, HoodieOperation operation) { - super(key, data, operation, null); + super(key, data, operation); } public HoodieAvroRecord(HoodieRecord record) { @@ -79,6 +73,11 @@ public T getData() { return data; } + @Override + public Comparable getOrderingValue(Properties props) { + return this.getData().getOrderingValue(); + } + @Override public String getRecordKey(Option keyGeneratorOpt) { return getRecordKey(); @@ -90,33 +89,22 @@ public String getRecordKey(String keyFieldName) { } @Override - public Comparable getOrderingValue() { - return data.getOrderingValue(); + public HoodieRecordType getRecordType() { + return HoodieRecordType.AVRO; } @Override - public Option toIndexedRecord(Schema schema, Properties prop) throws IOException { - return 
getData().getInsertValue(schema, prop); + public Object getRecordColumnValues(Schema recordSchema, String[] columns, boolean consistentLogicalTimestampEnabled) { + return HoodieAvroUtils.getRecordColumnValues(this, columns, recordSchema, consistentLogicalTimestampEnabled); } - ////////////////////////////////////////////////////////////////////////////// - - // - // NOTE: This method duplicates those ones of the HoodieRecordPayload and are placed here - // for the duration of RFC-46 implementation, until migration off `HoodieRecordPayload` - // is complete @Override - public HoodieRecord mergeWith(HoodieRecord other, Schema readerSchema, Schema writerSchema) throws IOException { - ValidationUtils.checkState(other instanceof HoodieAvroRecord); - GenericRecord mergedPayload = HoodieAvroUtils.stitchRecords( - (GenericRecord) toIndexedRecord(readerSchema, new Properties()).get(), - (GenericRecord) other.toIndexedRecord(readerSchema, new Properties()).get(), - writerSchema); - return new HoodieAvroRecord(getKey(), instantiateRecordPayloadWrapper(mergedPayload, getOrderingValue()), getOperation()); + public HoodieRecord mergeWith(HoodieRecord other, Schema targetSchema) throws IOException { + throw new UnsupportedOperationException(); } @Override - public HoodieRecord rewriteRecord(Schema recordSchema, Schema targetSchema, TypedProperties props) throws IOException { + public HoodieRecord rewriteRecord(Schema recordSchema, Properties props, Schema targetSchema) throws IOException { Option avroRecordPayloadOpt = getData().getInsertValue(recordSchema, props); GenericRecord avroPayloadInNewSchema = HoodieAvroUtils.rewriteRecord((GenericRecord) avroRecordPayloadOpt.get(), targetSchema); @@ -124,62 +112,57 @@ public HoodieRecord rewriteRecord(Schema recordSchema, Schema targetSchema, Type } @Override - public HoodieRecord rewriteRecord(Schema recordSchema, Properties prop, boolean schemaOnReadEnabled, Schema writeSchemaWithMetaFields) throws IOException { - GenericRecord record = (GenericRecord) getData().getInsertValue(recordSchema, prop).get(); - GenericRecord rewriteRecord = schemaOnReadEnabled ? HoodieAvroUtils.rewriteRecordWithNewSchema(record, writeSchemaWithMetaFields, new HashMap<>()) - : HoodieAvroUtils.rewriteRecord(record, writeSchemaWithMetaFields); + public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties props, Schema newSchema, Map renameCols) throws IOException { + GenericRecord oldRecord = (GenericRecord) getData().getInsertValue(recordSchema, props).get(); + GenericRecord rewriteRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(oldRecord, newSchema, renameCols); return new HoodieAvroRecord<>(getKey(), new RewriteAvroPayload(rewriteRecord), getOperation()); } @Override - public HoodieRecord rewriteRecordWithMetadata(Schema recordSchema, Properties prop, boolean schemaOnReadEnabled, Schema writeSchemaWithMetaFields, String fileName) throws IOException { - GenericRecord record = (GenericRecord) getData().getInsertValue(recordSchema, prop).get(); - GenericRecord rewriteRecord = schemaOnReadEnabled ? 
HoodieAvroUtils.rewriteEvolutionRecordWithMetadata(record, writeSchemaWithMetaFields, fileName) - : HoodieAvroUtils.rewriteRecordWithMetadata(record, writeSchemaWithMetaFields, fileName); - return new HoodieAvroRecord<>(getKey(), new RewriteAvroPayload(rewriteRecord), getOperation()); - } + public HoodieRecord updateValues(Schema recordSchema, Properties props, Map metadataValues) throws IOException { + GenericRecord avroRecordPayload = (GenericRecord) getData().getInsertValue(recordSchema, props).get(); - @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema, Map renameCols) throws IOException { - GenericRecord oldRecord = (GenericRecord) getData().getInsertValue(recordSchema, prop).get(); - GenericRecord rewriteRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(oldRecord, newSchema, renameCols); - return new HoodieAvroRecord<>(getKey(), new RewriteAvroPayload(rewriteRecord), getOperation()); + metadataValues.forEach((key, value) -> { + if (value != null) { + avroRecordPayload.put(key, value); + } + }); + + return new HoodieAvroRecord<>(getKey(), new RewriteAvroPayload(avroRecordPayload), getOperation()); } @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema) throws IOException { - GenericRecord oldRecord = (GenericRecord) getData().getInsertValue(recordSchema, prop).get(); - GenericRecord rewriteRecord = HoodieAvroUtils.rewriteRecord(oldRecord, newSchema); - return new HoodieAvroRecord<>(getKey(), new RewriteAvroPayload(rewriteRecord), getOperation()); + public boolean isDelete(Schema schema, Properties props) throws IOException { + return !getData().getInsertValue(schema, props).isPresent(); } @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema, Map renameCols, Mapper mapper) throws IOException { - GenericRecord oldRecord = (GenericRecord) getData().getInsertValue(recordSchema, prop).get(); - GenericRecord rewriteRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(oldRecord, newSchema, renameCols); - return mapper.apply(rewriteRecord); + public boolean shouldIgnore(Schema schema, Properties props) throws IOException { + Option insertRecord = getData().getInsertValue(schema, props); + // just skip the ignored record + if (insertRecord.isPresent() && insertRecord.get().equals(SENTINEL)) { + return true; + } else { + return false; + } } @Override - public HoodieRecord overrideMetadataFieldValue(Schema recordSchema, Properties prop, int pos, String newValue) throws IOException { - IndexedRecord record = (IndexedRecord) data.getInsertValue(recordSchema, prop).get(); - record.put(pos, newValue); - return new HoodieAvroRecord<>(getKey(), new RewriteAvroPayload((GenericRecord) record), getOperation()); + public HoodieRecord wrapIntoHoodieRecordPayloadWithParams( + Schema schema, Properties props, + Option> simpleKeyGenFieldsOpt, + Boolean withOperation, + Option partitionNameOp, + Boolean populateMetaFields) throws IOException { + IndexedRecord indexedRecord = (IndexedRecord) data.getInsertValue(schema, props).get(); + String payloadClass = ConfigUtils.getPayloadClass(props); + String preCombineField = ConfigUtils.getOrderingField(props); + return HoodieAvroUtils.createHoodieRecordFromAvro(indexedRecord, payloadClass, preCombineField, simpleKeyGenFieldsOpt, withOperation, partitionNameOp, populateMetaFields); } @Override - public HoodieRecord addMetadataValues(Schema recordSchema, Properties prop, Map 
metadataValues) throws IOException { - // NOTE: RewriteAvroPayload is expected here - GenericRecord avroRecordPayload = (GenericRecord) getData().getInsertValue(recordSchema, prop).get(); - - Arrays.stream(HoodieMetadataField.values()).forEach(metadataField -> { - String value = metadataValues.get(metadataField); - if (value != null) { - avroRecordPayload.put(metadataField.getFieldName(), value); - } - }); - - return new HoodieAvroRecord<>(getKey(), new RewriteAvroPayload(avroRecordPayload), getOperation()); + public HoodieRecord wrapIntoHoodieRecordPayloadWithKeyGen(Properties props, Option keyGen) { + throw new UnsupportedOperationException(); } public Option> getMetadata() { @@ -187,28 +170,12 @@ public Option> getMetadata() { } @Override - public boolean isPresent(Schema schema, Properties prop) throws IOException { - return getData().getInsertValue(schema, prop).isPresent(); - } - - @Override - public boolean shouldIgnore(Schema schema, Properties prop) throws IOException { - Option insertRecord = getData().getInsertValue(schema, prop); - // just skip the ignored record - if (insertRecord.isPresent() && insertRecord.get().equals(SENTINEL)) { - return true; + public Option toIndexedRecord(Schema schema, Properties props) throws IOException { + Option avroData = getData().getInsertValue(schema, props); + if (avroData.isPresent()) { + return Option.of(new HoodieAvroIndexedRecord(avroData.get())); } else { - return false; + return Option.empty(); } } - - @Nonnull - private T instantiateRecordPayloadWrapper(Object combinedAvroPayload, Comparable newPreCombineVal) { - return unsafeCast( - HoodieRecordUtils.loadPayload( - getData().getClass().getCanonicalName(), - new Object[]{combinedAvroPayload, newPreCombineVal}, - GenericRecord.class, - Comparable.class)); - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerge.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerger.java similarity index 55% rename from hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerge.java rename to hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerger.java index bea89efdc111e..9fa70a3719bda 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerge.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerger.java @@ -20,7 +20,11 @@ import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; + +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.metadata.HoodieMetadataPayload; import java.io.IOException; @@ -28,9 +32,32 @@ import static org.apache.hudi.common.util.TypeUtils.unsafeCast; -public class HoodieAvroRecordMerge implements HoodieMerge { +public class HoodieAvroRecordMerger implements HoodieRecordMerger { + + @Override + public String getMergingStrategy() { + return StringUtils.DEFAULT_MERGER_STRATEGY_UUID; + } + @Override - public HoodieRecord preCombine(HoodieRecord older, HoodieRecord newer) { + public Option merge(HoodieRecord older, HoodieRecord newer, Schema schema, Properties props) throws IOException { + ValidationUtils.checkArgument(older.getRecordType() == HoodieRecordType.AVRO); + ValidationUtils.checkArgument(newer.getRecordType() == HoodieRecordType.AVRO); + if (older instanceof HoodieAvroRecord && newer instanceof 
HoodieAvroRecord) { + return Option.of(preCombine(older, newer)); + } else if (older instanceof HoodieAvroIndexedRecord && newer instanceof HoodieAvroRecord) { + return combineAndGetUpdateValue(older, newer, schema, props); + } else { + throw new UnsupportedOperationException(); + } + } + + @Override + public HoodieRecordType getRecordType() { + return HoodieRecordType.AVRO; + } + + private HoodieRecord preCombine(HoodieRecord older, HoodieRecord newer) { HoodieRecordPayload picked = unsafeCast(((HoodieAvroRecord) newer).getData().preCombine(((HoodieAvroRecord) older).getData())); if (picked instanceof HoodieMetadataPayload) { // NOTE: HoodieMetadataPayload return a new payload @@ -39,19 +66,13 @@ public HoodieRecord preCombine(HoodieRecord older, HoodieRecord newer) { return picked.equals(((HoodieAvroRecord) newer).getData()) ? newer : older; } - @Override - public Option combineAndGetUpdateValue(HoodieRecord older, HoodieRecord newer, Schema schema, Properties props) throws IOException { - Option previousRecordAvroPayload; - if (older instanceof HoodieAvroIndexedRecord) { - previousRecordAvroPayload = Option.ofNullable(((HoodieAvroIndexedRecord) older).getData()); - } else { - previousRecordAvroPayload = ((HoodieRecordPayload)older.getData()).getInsertValue(schema, props); - } + private Option combineAndGetUpdateValue(HoodieRecord older, HoodieRecord newer, Schema schema, Properties props) throws IOException { + Option previousRecordAvroPayload = older.toIndexedRecord(schema, props); if (!previousRecordAvroPayload.isPresent()) { return Option.empty(); } - return ((HoodieAvroRecord) newer).getData().combineAndGetUpdateValue(previousRecordAvroPayload.get(), schema, props) + return ((HoodieAvroRecord) newer).getData().combineAndGetUpdateValue(previousRecordAvroPayload.get().getData(), schema, props) .map(combinedAvroPayload -> new HoodieAvroIndexedRecord((IndexedRecord) combinedAvroPayload)); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieEmptyRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieEmptyRecord.java new file mode 100644 index 0000000000000..f2167bb15eb76 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieEmptyRecord.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.keygen.BaseKeyGenerator; + +import org.apache.avro.Schema; + +import java.io.IOException; +import java.util.Map; +import java.util.Properties; + +public class HoodieEmptyRecord extends HoodieRecord { + + private final HoodieRecordType type; + private final Comparable orderingVal; + + public HoodieEmptyRecord(HoodieKey key, HoodieRecordType type) { + super(key, null); + this.type = type; + this.orderingVal = null; + } + + public HoodieEmptyRecord(HoodieKey key, HoodieOperation operation, Comparable orderingVal, HoodieRecordType type) { + super(key, null, operation); + this.type = type; + this.orderingVal = orderingVal; + } + + public HoodieEmptyRecord(HoodieRecord record, HoodieRecordType type) { + super(record); + this.type = type; + this.orderingVal = record.getOrderingValue(new Properties()); + } + + public HoodieEmptyRecord(HoodieRecordType type) { + this.type = type; + this.orderingVal = null; + } + + @Override + public T getData() { + return null; + } + + @Override + public Comparable getOrderingValue(Properties props) { + return orderingVal; + } + + @Override + public HoodieRecord newInstance() { + return this; + } + + @Override + public HoodieRecord newInstance(HoodieKey key, HoodieOperation op) { + return new HoodieEmptyRecord<>(key, op, orderingVal, type); + } + + @Override + public HoodieRecord newInstance(HoodieKey key) { + return new HoodieEmptyRecord<>(key, type); + } + + @Override + public HoodieRecordType getRecordType() { + return type; + } + + @Override + public String getRecordKey(Option keyGeneratorOpt) { + return key.getRecordKey(); + } + + @Override + public String getRecordKey(String keyFieldName) { + return key.getRecordKey(); + } + + @Override + public Object getRecordColumnValues(Schema recordSchema, String[] columns, boolean consistentLogicalTimestampEnabled) { + throw new UnsupportedOperationException(); + } + + @Override + public HoodieRecord mergeWith(HoodieRecord other, Schema targetSchema) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public HoodieRecord rewriteRecord(Schema recordSchema, Properties props, Schema targetSchema) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties props, Schema newSchema, Map renameCols) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public HoodieRecord updateValues(Schema recordSchema, Properties props, Map metadataValues) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isDelete(Schema schema, Properties props) throws IOException { + return true; + } + + @Override + public boolean shouldIgnore(Schema schema, Properties props) throws IOException { + return false; + } + + @Override + public HoodieRecord wrapIntoHoodieRecordPayloadWithParams(Schema schema, Properties props, Option> simpleKeyGenFieldsOpt, Boolean withOperation, Option partitionNameOp, + Boolean populateMetaFieldsOp) + throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public HoodieRecord wrapIntoHoodieRecordPayloadWithKeyGen(Properties props, Option keyGen) { + throw new UnsupportedOperationException(); + } + + @Override + public Option toIndexedRecord(Schema schema, Properties props) throws IOException { + return 
Option.empty(); + } + + @Override + public Option> getMetadata() { + return Option.empty(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java index 8ab6a245c4745..22807d927e8e3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java @@ -18,10 +18,10 @@ package org.apache.hudi.common.model; +import java.util.Collections; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hudi.common.config.TypedProperties; + import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; @@ -40,7 +40,7 @@ /** * A Single Record managed by Hoodie. */ -public abstract class HoodieRecord implements Serializable { +public abstract class HoodieRecord implements HoodieRecordCompatibilityInterface, Serializable { public static final String COMMIT_TIME_METADATA_FIELD = HoodieMetadataField.COMMIT_TIME_METADATA_FIELD.getFieldName(); public static final String COMMIT_SEQNO_METADATA_FIELD = HoodieMetadataField.COMMIT_SEQNO_METADATA_FIELD.getFieldName(); @@ -109,7 +109,7 @@ public String getFieldName() { /** * Identifies the record across the table. */ - private HoodieKey key; + protected HoodieKey key; /** * Actual payload of the record. @@ -136,32 +136,21 @@ public String getFieldName() { */ private HoodieOperation operation; - /** - * For purposes of preCombining. - */ - private Comparable orderingVal; - public HoodieRecord(HoodieKey key, T data) { - this(key, data, null, null); + this(key, data, null); } - public HoodieRecord(HoodieKey key, T data, Comparable orderingVal) { - this(key, data, null, orderingVal); - } - - public HoodieRecord(HoodieKey key, T data, HoodieOperation operation, Comparable orderingVal) { + public HoodieRecord(HoodieKey key, T data, HoodieOperation operation) { this.key = key; this.data = data; this.currentLocation = null; this.newLocation = null; this.sealed = false; this.operation = operation; - // default natural order is 0 - this.orderingVal = orderingVal == null ? 
0 : orderingVal; } public HoodieRecord(HoodieRecord record) { - this(record.key, record.data, record.operation, record.orderingVal); + this(record.key, record.data, record.operation); this.currentLocation = record.currentLocation; this.newLocation = record.newLocation; this.sealed = record.sealed; @@ -184,13 +173,11 @@ public HoodieOperation getOperation() { return operation; } - public Comparable getOrderingValue() { - return orderingVal; - } + public abstract Comparable getOrderingValue(Properties props); public T getData() { if (data == null) { - throw new IllegalStateException("HoodieRecord already deflated for record."); + throw new IllegalStateException("Payload already deflated for record."); } return data; } @@ -213,6 +200,10 @@ public HoodieRecord setCurrentLocation(HoodieRecordLocation location) { return this; } + public void setData(T data) { + this.data = data; + } + public HoodieRecordLocation getCurrentLocation() { return currentLocation; } @@ -273,6 +264,8 @@ public String getRecordKey() { return key.getRecordKey(); } + public abstract HoodieRecordType getRecordType(); + public abstract String getRecordKey(Option keyGeneratorOpt); public abstract String getRecordKey(String keyFieldName); @@ -291,59 +284,51 @@ public void checkState() { } } - ////////////////////////////////////////////////////////////////////////////// - - // - // NOTE: This method duplicates those ones of the HoodieRecordPayload and are placed here - // for the duration of RFC-46 implementation, until migration off `HoodieRecordPayload` - // is complete - // - public abstract HoodieRecord mergeWith(HoodieRecord other, Schema readerSchema, Schema writerSchema) throws IOException; + /** + * Get column in record to support RDDCustomColumnsSortPartitioner + */ + public abstract Object getRecordColumnValues(Schema recordSchema, String[] columns, boolean consistentLogicalTimestampEnabled); - public abstract HoodieRecord rewriteRecord(Schema recordSchema, Schema targetSchema, TypedProperties props) throws IOException; + /** + * Support bootstrap. + */ + public abstract HoodieRecord mergeWith(HoodieRecord other, Schema targetSchema) throws IOException; /** - * Rewrite the GenericRecord with the Schema containing the Hoodie Metadata fields. + * Rewrite record into new schema(add meta columns) */ - public abstract HoodieRecord rewriteRecord(Schema recordSchema, Properties prop, boolean schemaOnReadEnabled, Schema writeSchemaWithMetaFields) throws IOException; + public abstract HoodieRecord rewriteRecord(Schema recordSchema, Properties props, Schema targetSchema) throws IOException; - public abstract HoodieRecord rewriteRecordWithMetadata(Schema recordSchema, Properties prop, boolean schemaOnReadEnabled, Schema writeSchemaWithMetaFields, String fileName) throws IOException; + /** + * Support schema evolution. 
+ */ + public abstract HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties props, Schema newSchema, Map renameCols) throws IOException; - public abstract HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema, Map renameCols) throws IOException; + public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties props, Schema newSchema) throws IOException { + return rewriteRecordWithNewSchema(recordSchema, props, newSchema, Collections.emptyMap()); + } - public abstract HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema, Map renameCols, Mapper mapper) throws IOException; + /** + * Updates the given metadata field values on this record. + * NOTE: This API is temporary and could change in the future. + */ + public abstract HoodieRecord updateValues(Schema recordSchema, Properties props, Map metadataValues) throws IOException; - public abstract HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema) throws IOException; + public abstract boolean isDelete(Schema schema, Properties props) throws IOException; - public abstract HoodieRecord overrideMetadataFieldValue(Schema recordSchema, Properties prop, int pos, String newValue) throws IOException; + /** + * Returns true for an empty record (e.g. the sentinel generated by ExpressionPayload) that should be skipped. + */ + public abstract boolean shouldIgnore(Schema schema, Properties props) throws IOException; - public abstract HoodieRecord addMetadataValues(Schema recordSchema, Properties prop, Map metadataValues) throws IOException; + public abstract Option toIndexedRecord(Schema schema, Properties props) throws IOException; public abstract Option> getMetadata(); - public abstract boolean isPresent(Schema schema, Properties prop) throws IOException; - - public abstract boolean shouldIgnore(Schema schema, Properties prop) throws IOException; - - public abstract Option toIndexedRecord(Schema schema, Properties prop) throws IOException; - - ////////////////////////////////////////////////////////////////////////////// - public static String generateSequenceId(String instantTime, int partitionId, long recordIndex) { return instantTime + "_" + partitionId + "_" + recordIndex; } - /** - * NOTE: This is temporary transition construct to be able to construct - * HoodieRecord instances w/o excessive wiring into a lot of components - * a lot of details that are irrelevant for these - * TODO remove - */ - @FunctionalInterface - public interface Mapper { - HoodieRecord apply(IndexedRecord avroPayload); - } - /** * A special record returned by {@link HoodieRecordPayload}, which means we should just skip this record.
* This record is only used for {@link HoodieRecordPayload} currently, so it should not @@ -384,4 +369,8 @@ public Object get(String key) { return null; } } + + public enum HoodieRecordType { + AVRO, SPARK + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieMerge.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordCompatibilityInterface.java similarity index 56% rename from hudi-common/src/main/java/org/apache/hudi/common/model/HoodieMerge.java rename to hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordCompatibilityInterface.java index 6becf355914c5..c2eb164dcac40 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieMerge.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordCompatibilityInterface.java @@ -18,21 +18,28 @@ package org.apache.hudi.common.model; +import java.io.IOException; +import java.util.Properties; import org.apache.avro.Schema; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.keygen.BaseKeyGenerator; -import java.io.IOException; -import java.io.Serializable; -import java.util.Properties; +public interface HoodieRecordCompatibilityInterface { -/** - * HoodieMerge defines how to merge two records. It is a stateless component. - * It can implement the merging logic of HoodieRecord of different engines - * and avoid the performance consumption caused by the serialization/deserialization of Avro payload. - */ -public interface HoodieMerge extends Serializable { - - HoodieRecord preCombine(HoodieRecord older, HoodieRecord newer); + /** + * This method used to extract HoodieKey not through keyGenerator. + */ + HoodieRecord wrapIntoHoodieRecordPayloadWithParams( + Schema schema, + Properties props, + Option> simpleKeyGenFieldsOpt, + Boolean withOperation, + Option partitionNameOp, + Boolean populateMetaFieldsOp) throws IOException; - Option combineAndGetUpdateValue(HoodieRecord older, HoodieRecord newer, Schema schema, Properties props) throws IOException; + /** + * This method used to extract HoodieKey through keyGenerator. This method used in ClusteringExecutionStrategy. + */ + HoodieRecord wrapIntoHoodieRecordPayloadWithKeyGen(Properties props, Option keyGen); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordMerger.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordMerger.java new file mode 100644 index 0000000000000..43665b571102c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordMerger.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.model; + +import org.apache.avro.Schema; + +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIClass; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; +import org.apache.hudi.common.util.Option; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Properties; + +/** + * HoodieRecordMerger defines how to merge two records. It is a stateless component. + * It can implement the merging logic for HoodieRecords of different engines + * and avoids the performance cost of serializing/deserializing Avro payloads. + */ +@PublicAPIClass(maturity = ApiMaturityLevel.EVOLVING) +public interface HoodieRecordMerger extends Serializable { + + /** + * This method converges combineAndGetUpdateValue and preCombine from HoodieRecordPayload. + * It has to be an associative operation: f(a, f(b, c)) = f(f(a, b), c), i.e. given three versions A, B and C + * of the same record, both orders of application have to yield the same result. + */ + Option merge(HoodieRecord older, HoodieRecord newer, Schema schema, Properties props) throws IOException; + + /** + * The record type handled by the current merger, e.g. SPARK, AVRO or FLINK. + */ + HoodieRecordType getRecordType(); + + /** + * The merging strategy this RecordMerger belongs to. Each merging strategy is identified by a UUID. + */ + String getMergingStrategy(); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index aae29a21bb116..4b1a0bb0cb667 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -26,7 +26,6 @@ import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.config.OrderedProperties; import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieAvroRecordMerge; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -156,11 +155,10 @@ public class HoodieTableConfig extends HoodieConfig { .withDocumentation("Payload class to use for performing compactions, i.e merge delta logs with current base file and then " + " produce a new base file."); - public static final ConfigProperty MERGE_CLASS_NAME = ConfigProperty - .key("hoodie.compaction.merge.class") - .defaultValue(HoodieAvroRecordMerge.class.getName()) - .withDocumentation("Merge class provide stateless component interface for merging records, and support various HoodieRecord " - + "types, such as Spark records or Flink records."); + public static final ConfigProperty MERGER_STRATEGY = ConfigProperty + .key("hoodie.compaction.merger.strategy") + .defaultValue(StringUtils.DEFAULT_MERGER_STRATEGY_UUID) + .withDocumentation("Id of the merger strategy. Hudi will pick RecordMergers in hoodie.datasource.write.merger.impls which have the same merger strategy id."); public static final ConfigProperty ARCHIVELOG_FOLDER = ConfigProperty .key("hoodie.archivelog.folder") @@ -244,15 +242,24 @@ public class HoodieTableConfig extends HoodieConfig { private static final String TABLE_CHECKSUM_FORMAT = "%s.%s"; // .
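For illustration, a minimal merger against the interface added above might look as follows. This is a sketch only: the class name and strategy UUID are made up, and the generics (stripped from this diff) are assumed to resolve to Option<HoodieRecord> for merge(). A real implementation would additionally have to be listed in hoodie.datasource.write.merger.impls and selected through the table's hoodie.compaction.merger.strategy id.

package org.apache.hudi.example; // hypothetical package, not part of this patch

import org.apache.avro.Schema;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
import org.apache.hudi.common.model.HoodieRecordMerger;
import org.apache.hudi.common.util.Option;

import java.io.IOException;
import java.util.Properties;

/**
 * Illustrative "largest ordering value wins" merger for Avro records.
 */
public class LatestOrderingValueAvroMerger implements HoodieRecordMerger {

  // Hypothetical strategy id; any stable UUID works, it only has to match the
  // hoodie.compaction.merger.strategy value stored in the table config.
  private static final String STRATEGY_UUID = "00000000-0000-0000-0000-000000000000";

  @Override
  public Option<HoodieRecord> merge(HoodieRecord older, HoodieRecord newer, Schema schema, Properties props) throws IOException {
    Comparable olderOrdering = older.getOrderingValue(props);
    Comparable newerOrdering = newer.getOrderingValue(props);
    // Keep the record with the larger ordering value; ties fall back to arrival order
    // (the newer record), which keeps the operation associative as required by merge().
    return olderOrdering.compareTo(newerOrdering) > 0 ? Option.of(older) : Option.of(newer);
  }

  @Override
  public HoodieRecordType getRecordType() {
    return HoodieRecordType.AVRO;
  }

  @Override
  public String getMergingStrategy() {
    return STRATEGY_UUID;
  }
}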
- public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName) { + public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName, String mergerStrategy) { super(); Path propertyPath = new Path(metaPath, HOODIE_PROPERTIES_FILE); LOG.info("Loading table properties from " + propertyPath); try { fetchConfigs(fs, metaPath); + boolean needStore = false; if (contains(PAYLOAD_CLASS_NAME) && payloadClassName != null && !getString(PAYLOAD_CLASS_NAME).equals(payloadClassName)) { setValue(PAYLOAD_CLASS_NAME, payloadClassName); + needStore = true; + } + if (contains(MERGER_STRATEGY) && mergerStrategy != null + && !getString(MERGER_STRATEGY).equals(mergerStrategy)) { + setValue(MERGER_STRATEGY, mergerStrategy); + needStore = true; + } + if (needStore) { // FIXME(vc): wonder if this can be removed. Need to look into history. try (FSDataOutputStream outputStream = fs.create(propertyPath)) { storeProperties(props, outputStream); @@ -419,6 +426,7 @@ public static void create(FileSystem fs, Path metadataFolder, Properties propert hoodieConfig.setDefaultValue(TYPE); if (hoodieConfig.getString(TYPE).equals(HoodieTableType.MERGE_ON_READ.name())) { hoodieConfig.setDefaultValue(PAYLOAD_CLASS_NAME); + hoodieConfig.setDefaultValue(MERGER_STRATEGY); } hoodieConfig.setDefaultValue(ARCHIVELOG_FOLDER); if (!hoodieConfig.contains(TIMELINE_LAYOUT_VERSION)) { @@ -489,11 +497,10 @@ public String getPayloadClass() { } /** - * Read the hoodie merge class for HoodieRecords from the table properties. + * Read the record merger strategy for HoodieRecords from the table properties. */ - public String getMergeClass() { - return getStringOrDefault(MERGE_CLASS_NAME).replace("com.uber.hoodie", - "org.apache.hudi"); + public String getMergerStrategy() { + return getStringOrDefault(MERGER_STRATEGY); } public String getPreCombineField() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index e58e89b36bd6b..d6b556e46330c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -102,7 +102,6 @@ public class HoodieTableMetaClient implements Serializable { // computations secured by its immutability protected SerializablePath basePath; protected SerializablePath metaPath; - private transient HoodieWrapperFileSystem fs; private boolean loadActiveTimelineOnLoad; protected SerializableConfiguration hadoopConf; @@ -117,7 +116,7 @@ public class HoodieTableMetaClient implements Serializable { protected HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, ConsistencyGuardConfig consistencyGuardConfig, Option layoutVersion, - String payloadClassName, FileSystemRetryConfig fileSystemRetryConfig) { + String payloadClassName, String mergerStrategy, FileSystemRetryConfig fileSystemRetryConfig) { LOG.info("Loading HoodieTableMetaClient from " + basePath); this.consistencyGuardConfig = consistencyGuardConfig; this.fileSystemRetryConfig = fileSystemRetryConfig; @@ -126,7 +125,7 @@ protected HoodieTableMetaClient(Configuration conf, String basePath, boolean loa this.metaPath = new SerializablePath(new CachingPath(basePath, METAFOLDER_NAME)); this.fs = getFs(); TableNotFoundException.checkTableValidity(fs, this.basePath.get(), metaPath.get()); - this.tableConfig = new HoodieTableConfig(fs, metaPath.toString(), payloadClassName);
+ this.tableConfig = new HoodieTableConfig(fs, metaPath.toString(), payloadClassName, mergerStrategy); this.tableType = tableConfig.getTableType(); Option tableConfigVersion = tableConfig.getTimelineLayoutVersion(); if (layoutVersion.isPresent() && tableConfigVersion.isPresent()) { @@ -161,6 +160,7 @@ public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) .setConsistencyGuardConfig(oldMetaClient.consistencyGuardConfig) .setLayoutVersion(Option.of(oldMetaClient.timelineLayoutVersion)) .setPayloadClassName(null) + .setMergerStrategy(null) .setFileSystemRetryConfig(oldMetaClient.fileSystemRetryConfig).build(); } @@ -634,7 +634,7 @@ public void initializeBootstrapDirsIfNotExists() throws IOException { private static HoodieTableMetaClient newMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, ConsistencyGuardConfig consistencyGuardConfig, Option layoutVersion, - String payloadClassName, FileSystemRetryConfig fileSystemRetryConfig, Properties props) { + String payloadClassName, String mergerStrategy, FileSystemRetryConfig fileSystemRetryConfig, Properties props) { HoodieMetastoreConfig metastoreConfig = null == props ? new HoodieMetastoreConfig.Builder().build() : new HoodieMetastoreConfig.Builder().fromProperties(props).build(); @@ -644,7 +644,7 @@ private static HoodieTableMetaClient newMetaClient(Configuration conf, String ba conf, consistencyGuardConfig, fileSystemRetryConfig, props.getProperty(HoodieTableConfig.DATABASE_NAME.key()), props.getProperty(HoodieTableConfig.NAME.key()), metastoreConfig) : new HoodieTableMetaClient(conf, basePath, - loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, payloadClassName, fileSystemRetryConfig); + loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, payloadClassName, mergerStrategy, fileSystemRetryConfig); } public static Builder builder() { @@ -660,6 +660,7 @@ public static class Builder { private String basePath; private boolean loadActiveTimelineOnLoad = false; private String payloadClassName = null; + private String mergerStrategy = null; private ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build(); private FileSystemRetryConfig fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().build(); private Option layoutVersion = Option.of(TimelineLayoutVersion.CURR_LAYOUT_VERSION); @@ -685,6 +686,11 @@ public Builder setPayloadClassName(String payloadClassName) { return this; } + public Builder setMergerStrategy(String mergerStrategy) { + this.mergerStrategy = mergerStrategy; + return this; + } + public Builder setConsistencyGuardConfig(ConsistencyGuardConfig consistencyGuardConfig) { this.consistencyGuardConfig = consistencyGuardConfig; return this; @@ -709,7 +715,8 @@ public HoodieTableMetaClient build() { ValidationUtils.checkArgument(conf != null, "Configuration needs to be set to init HoodieTableMetaClient"); ValidationUtils.checkArgument(basePath != null, "basePath needs to be set to init HoodieTableMetaClient"); return newMetaClient(conf, basePath, - loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, payloadClassName, fileSystemRetryConfig, props); + loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, payloadClassName, + mergerStrategy, fileSystemRetryConfig, props); } } @@ -726,7 +733,7 @@ public static class PropertyBuilder { private String recordKeyFields; private String archiveLogFolder; private String payloadClassName; - private String mergeClassName; + private String mergerStrategy; private 
Integer timelineLayoutVersion; private String baseFileFormat; private String preCombineField; @@ -793,8 +800,8 @@ public PropertyBuilder setPayloadClassName(String payloadClassName) { return this; } - public PropertyBuilder setMergeClassName(String mergeClassName) { - this.mergeClassName = mergeClassName; + public PropertyBuilder setMergerStrategy(String mergerStrategy) { + this.mergerStrategy = mergerStrategy; return this; } @@ -900,9 +907,10 @@ public PropertyBuilder set(Map props) { public PropertyBuilder fromMetaClient(HoodieTableMetaClient metaClient) { return setTableType(metaClient.getTableType()) - .setTableName(metaClient.getTableConfig().getTableName()) - .setArchiveLogFolder(metaClient.getArchivePath()) - .setPayloadClassName(metaClient.getTableConfig().getPayloadClass()); + .setTableName(metaClient.getTableConfig().getTableName()) + .setArchiveLogFolder(metaClient.getArchivePath()) + .setPayloadClassName(metaClient.getTableConfig().getPayloadClass()) + .setMergerStrategy(metaClient.getTableConfig().getMergerStrategy()); } public PropertyBuilder fromProperties(Properties properties) { @@ -932,6 +940,10 @@ public PropertyBuilder fromProperties(Properties properties) { setPayloadClassName( hoodieConfig.getString(HoodieTableConfig.PAYLOAD_CLASS_NAME)); } + if (hoodieConfig.contains(HoodieTableConfig.MERGER_STRATEGY)) { + setMergerStrategy( + hoodieConfig.getString(HoodieTableConfig.MERGER_STRATEGY)); + } if (hoodieConfig.contains(HoodieTableConfig.TIMELINE_LAYOUT_VERSION)) { setTimelineLayoutVersion(hoodieConfig.getInt(HoodieTableConfig.TIMELINE_LAYOUT_VERSION)); } @@ -1009,9 +1021,8 @@ public Properties build() { if (tableType == HoodieTableType.MERGE_ON_READ && payloadClassName != null) { tableConfig.setValue(HoodieTableConfig.PAYLOAD_CLASS_NAME, payloadClassName); } - - if (mergeClassName != null) { - tableConfig.setValue(HoodieTableConfig.MERGE_CLASS_NAME, mergeClassName); + if (tableType == HoodieTableType.MERGE_ON_READ && mergerStrategy != null) { + tableConfig.setValue(HoodieTableConfig.MERGER_STRATEGY, mergerStrategy); } if (null != tableCreateSchema) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java index a41e5a1906621..65defdc2bdb13 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java @@ -19,14 +19,13 @@ package org.apache.hudi.common.table.log; import org.apache.hudi.common.model.DeleteRecord; -import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; -import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock.RecordIterator; import org.apache.hudi.common.table.log.block.HoodieCommandBlock; import org.apache.hudi.common.table.log.block.HoodieDataBlock; import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; @@ -36,7 +35,6 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import 
org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.InternalSchemaCache; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; @@ -44,8 +42,6 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hudi.internal.schema.InternalSchema; @@ -57,7 +53,6 @@ import java.io.IOException; import java.util.ArrayDeque; import java.util.Arrays; -import java.util.Collections; import java.util.Deque; import java.util.HashSet; import java.util.List; @@ -66,6 +61,7 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; +import static org.apache.hudi.common.util.TypeUtils.unsafeCast; import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType.INSTANT_TIME; import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.COMMAND_BLOCK; import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK; @@ -91,13 +87,13 @@ public abstract class AbstractHoodieLogRecordReader { // Latest valid instant time // Log-Blocks belonging to inflight delta-instants are filtered-out using this high-watermark. private final String latestInstantTime; - private final HoodieTableMetaClient hoodieTableMetaClient; + protected final HoodieTableMetaClient hoodieTableMetaClient; // Merge strategy to use when combining records from log private final String payloadClassFQN; // preCombine field - private final String preCombineField; + protected final String preCombineField; // Stateless component for merging records - private final String mergeClassFQN; + protected final HoodieRecordMerger recordMerger; // simple key gen fields private Option> simpleKeyGenFields = Option.empty(); // Log File Paths @@ -140,21 +136,23 @@ public abstract class AbstractHoodieLogRecordReader { private Option partitionName; // Populate meta fields for the records private boolean populateMetaFields = true; + // Record type read from log block + protected final HoodieRecordType recordType; protected AbstractHoodieLogRecordReader(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int bufferSize, Option instantRange, - boolean withOperationField) { + boolean withOperationField, HoodieRecordMerger recordMerger) { this(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, - instantRange, withOperationField, true, Option.empty(), InternalSchema.getEmptyInternalSchema()); + instantRange, withOperationField, true, Option.empty(), InternalSchema.getEmptyInternalSchema(), recordMerger); } protected AbstractHoodieLogRecordReader(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int bufferSize, Option instantRange, boolean withOperationField, boolean forceFullScan, - Option partitionName, InternalSchema internalSchema) { + Option partitionName, InternalSchema internalSchema, HoodieRecordMerger recordMerger) { this.readerSchema = readerSchema; this.latestInstantTime = latestInstantTime; this.hoodieTableMetaClient = 
HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).build(); @@ -162,7 +160,7 @@ protected AbstractHoodieLogRecordReader(FileSystem fs, String basePath, List keySpecOpt) throws Exception { - HoodieRecord.Mapper mapper = (rec) -> createHoodieRecord(rec, this.hoodieTableMetaClient.getTableConfig(), - this.payloadClassFQN, this.preCombineField, this.withOperationField, this.simpleKeyGenFields, this.partitionName); - - try (ClosableIterator recordIterator = getRecordsIterator(dataBlock, keySpecOpt, mapper)) { + try (ClosableIterator recordIterator = getRecordsIterator(dataBlock, keySpecOpt, recordType)) { Option schemaOption = getMergedSchema(dataBlock); - Schema finalReadSchema; - if (recordIterator instanceof RecordIterator) { - finalReadSchema = ((RecordIterator) recordIterator).getFinalReadSchema(); - } else { - finalReadSchema = dataBlock.getSchema(); - } while (recordIterator.hasNext()) { HoodieRecord currentRecord = recordIterator.next(); - HoodieRecord record = schemaOption.isPresent() - ? currentRecord.rewriteRecordWithNewSchema(finalReadSchema, new Properties(), schemaOption.get(), Collections.emptyMap(), mapper) : currentRecord; - processNextRecord(record); + Schema schema = schemaOption.isPresent() ? schemaOption.get() : dataBlock.getSchema(); + HoodieRecord record = schemaOption.isPresent() ? currentRecord.rewriteRecordWithNewSchema(dataBlock.getSchema(), new Properties(), schemaOption.get()) : currentRecord; + HoodieRecord completedRecord = record.wrapIntoHoodieRecordPayloadWithParams(schema, hoodieTableMetaClient.getTableConfig().getProps(), this.simpleKeyGenFields, + this.withOperationField, this.partitionName, getPopulateMetaFields()); + processNextRecord(completedRecord); totalLogRecords.incrementAndGet(); } } } + protected boolean getPopulateMetaFields() { + return this.populateMetaFields; + } + /** * Get final Read Schema for support evolution. * step1: find the fileSchema for current dataBlock. @@ -423,38 +419,12 @@ private Option getMergedSchema(HoodieDataBlock dataBlock) { return result; } - /** - * Create @{@link HoodieRecord} from the @{@link IndexedRecord}. - * - * @param rec - IndexedRecord to create the HoodieRecord from - * @param hoodieTableConfig - Table config - * @param payloadClassFQN - Payload class fully qualified name - * @param preCombineField - PreCombine field - * @param withOperationField - Whether operation field is enabled - * @param simpleKeyGenFields - Key generator fields when populate meta fields is tuened off - * @param partitionName - Partition name - * @return HoodieRecord created from the IndexedRecord - */ - protected HoodieAvroRecord createHoodieRecord(final IndexedRecord rec, final HoodieTableConfig hoodieTableConfig, - final String payloadClassFQN, final String preCombineField, - final boolean withOperationField, - final Option> simpleKeyGenFields, - final Option partitionName) { - if (this.populateMetaFields) { - return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) rec, payloadClassFQN, - preCombineField, withOperationField); - } else { - return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) rec, payloadClassFQN, - preCombineField, simpleKeyGenFields.get(), withOperationField, partitionName); - } - } - /** * Process next record. 
* * @param hoodieRecord Hoodie Record to process */ - protected abstract void processNextRecord(HoodieRecord hoodieRecord) throws Exception; + protected abstract void processNextRecord(HoodieRecord hoodieRecord) throws Exception; /** * Process next deleted record. @@ -496,13 +466,15 @@ private void processQueuedBlocksForInstant(Deque logBlocks, int progress = (numLogFilesSeen - 1) / logFilePaths.size(); } - private ClosableIterator getRecordsIterator(HoodieDataBlock dataBlock, Option keySpecOpt, HoodieRecord.Mapper mapper) throws IOException { + private ClosableIterator getRecordsIterator(HoodieDataBlock dataBlock, Option keySpecOpt, HoodieRecordType type) throws IOException { + ClosableIterator iter; if (keySpecOpt.isPresent()) { KeySpec keySpec = keySpecOpt.get(); - return dataBlock.getRecordIterator(keySpec.keys, keySpec.fullKey, mapper); + iter = unsafeCast(dataBlock.getRecordIterator(keySpec.keys, keySpec.fullKey, type)); + } else { + iter = unsafeCast(dataBlock.getRecordIterator(type)); } - - return dataBlock.getRecordIterator(mapper); + return iter; } /** @@ -528,10 +500,6 @@ protected String getPayloadClassFQN() { return payloadClassFQN; } - protected String getMergeClassFQN() { - return mergeClassFQN; - } - public Option getPartitionName() { return partitionName; } @@ -591,6 +559,10 @@ public Builder withOperationField(boolean withOperationField) { throw new UnsupportedOperationException(); } + public Builder withRecordMerger(HoodieRecordMerger recordMerger) { + throw new UnsupportedOperationException(); + } + public abstract AbstractHoodieLogRecordReader build(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java index dfc3c14b5b83e..f39cb1fc02397 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java @@ -20,27 +20,25 @@ import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.model.DeleteRecord; -import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieEmptyRecord; import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; -import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.HoodieRecordSizeEstimator; -import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.SpillableMapUtils; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.internal.schema.InternalSchema; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; -import org.apache.hudi.internal.schema.InternalSchema; - import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ 
-80,8 +78,6 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader // Stores the total time taken to perform reading and merging of log blocks private long totalTimeTakenToReadAndMergeBlocks; - private final HoodieMerge merge; - @SuppressWarnings("unchecked") protected HoodieMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, String latestInstantTime, Long maxMemorySizeInBytes, boolean readBlocksLazily, @@ -90,16 +86,14 @@ protected HoodieMergedLogRecordScanner(FileSystem fs, String basePath, List partitionName, InternalSchema internalSchema) { + Option partitionName, InternalSchema internalSchema, HoodieRecordMerger recordMerger) { super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, - instantRange, withOperationField, - forceFullScan, partitionName, internalSchema); + instantRange, withOperationField, forceFullScan, partitionName, internalSchema, recordMerger); try { // Store merged records for all versions for this log file, set the in-memory footprint to maxInMemoryMapSize this.records = new ExternalSpillableMap<>(maxMemorySizeInBytes, spillableMapBasePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(readerSchema), diskMapType, isBitCaskDiskMapCompressionEnabled); this.maxMemorySizeInBytes = maxMemorySizeInBytes; - this.merge = HoodieRecordUtils.loadMerge(getMergeClassFQN()); } catch (IOException e) { throw new HoodieIOException("IOException when creating ExternalSpillableMap at " + spillableMapBasePath, e); } @@ -147,19 +141,19 @@ public static HoodieMergedLogRecordScanner.Builder newBuilder() { } @Override - protected void processNextRecord(HoodieRecord hoodieRecord) throws IOException { + protected void processNextRecord(HoodieRecord hoodieRecord) throws IOException { String key = hoodieRecord.getRecordKey(); if (records.containsKey(key)) { // Merge and store the merged record. The HoodieRecordPayload implementation is free to decide what should be // done when a DELETE (empty payload) is encountered before or after an insert/update. - HoodieRecord oldRecord = records.get(key); - HoodieRecordPayload oldValue = oldRecord.getData(); - HoodieRecordPayload combinedValue = (HoodieRecordPayload) merge.preCombine(oldRecord, hoodieRecord).getData(); + HoodieRecord oldRecord = records.get(key); + T oldValue = oldRecord.getData(); + T combinedValue = ((HoodieRecord) recordMerger.merge(oldRecord, hoodieRecord, readerSchema, this.hoodieTableMetaClient.getTableConfig().getProps()).get()).getData(); // If combinedValue is oldValue, no need rePut oldRecord if (combinedValue != oldValue) { - HoodieOperation operation = hoodieRecord.getOperation(); - records.put(key, new HoodieAvroRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue, operation)); + hoodieRecord.setData(combinedValue); + records.put(key, hoodieRecord); } } else { // Put the record as is @@ -170,13 +164,14 @@ protected void processNextRecord(HoodieRecord hoodieRecord) throws IOException { @Override protected void processNextDeletedRecord(DeleteRecord deleteRecord) { String key = deleteRecord.getRecordKey(); - HoodieRecord oldRecord = records.get(key); + HoodieRecord oldRecord = records.get(key); if (oldRecord != null) { // Merge and store the merged record. The ordering val is taken to decide whether the same key record // should be deleted or be kept. The old record is kept only if the DELETE record has smaller ordering val. 
// For same ordering values, uses the natural order(arrival time semantics). - Comparable curOrderingVal = oldRecord.getOrderingValue(); + Comparable curOrderingVal = oldRecord.getOrderingValue( + this.hoodieTableMetaClient.getTableConfig().getProps()); Comparable deleteOrderingVal = deleteRecord.getOrderingValue(); // Checks the ordering value does not equal to 0 // because we use 0 as the default value which means natural order @@ -189,8 +184,13 @@ protected void processNextDeletedRecord(DeleteRecord deleteRecord) { } } // Put the DELETE record - records.put(key, SpillableMapUtils.generateEmptyPayload(key, - deleteRecord.getPartitionPath(), deleteRecord.getOrderingValue(), getPayloadClassFQN())); + if (recordType == HoodieRecordType.AVRO) { + records.put(key, SpillableMapUtils.generateEmptyPayload(key, + deleteRecord.getPartitionPath(), deleteRecord.getOrderingValue(), getPayloadClassFQN())); + } else { + HoodieEmptyRecord record = new HoodieEmptyRecord<>(new HoodieKey(key, deleteRecord.getPartitionPath()), null, deleteRecord.getOrderingValue(), recordType); + records.put(key, record); + } } public long getTotalTimeTakenToReadAndMergeBlocks() { @@ -226,6 +226,7 @@ public static class Builder extends AbstractHoodieLogRecordReader.Builder { protected String partitionName; // operation field default false private boolean withOperationField = false; + private HoodieRecordMerger recordMerger; @Override public Builder withFileSystem(FileSystem fs) { @@ -317,16 +318,24 @@ public Builder withPartition(String partitionName) { return this; } + @Override + public Builder withRecordMerger(HoodieRecordMerger recordMerger) { + this.recordMerger = recordMerger; + return this; + } + @Override public HoodieMergedLogRecordScanner build() { if (this.partitionName == null && CollectionUtils.nonEmpty(this.logFilePaths)) { this.partitionName = getRelativePartitionPath(new Path(basePath), new Path(this.logFilePaths.get(0)).getParent()); } + ValidationUtils.checkArgument(recordMerger != null); + return new HoodieMergedLogRecordScanner(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, readBlocksLazily, reverseReader, bufferSize, spillableMapBasePath, instantRange, diskMapType, isBitCaskDiskMapCompressionEnabled, withOperationField, true, - Option.ofNullable(partitionName), internalSchema); + Option.ofNullable(partitionName), internalSchema, recordMerger); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index 8ea34d6f2fa0d..c29f23192109d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -20,8 +20,9 @@ import org.apache.hudi.common.model.DeleteRecord; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; @@ -37,8 +38,8 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReade private HoodieUnMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int 
bufferSize, - LogRecordScannerCallback callback, Option instantRange) { - super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, instantRange, false); + LogRecordScannerCallback callback, Option instantRange, HoodieRecordMerger recordMerger) { + super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, instantRange, false, recordMerger); this.callback = callback; } @@ -50,7 +51,7 @@ public static HoodieUnMergedLogRecordScanner.Builder newBuilder() { } @Override - protected void processNextRecord(HoodieRecord hoodieRecord) throws Exception { + protected void processNextRecord(HoodieRecord hoodieRecord) throws Exception { // Just call callback without merging callback.apply(hoodieRecord); } @@ -66,7 +67,7 @@ protected void processNextDeletedRecord(DeleteRecord deleteRecord) { @FunctionalInterface public interface LogRecordScannerCallback { - void apply(HoodieRecord record) throws Exception; + void apply(HoodieRecord record) throws Exception; } /** @@ -84,6 +85,7 @@ public static class Builder extends AbstractHoodieLogRecordReader.Builder { private Option instantRange = Option.empty(); // specific configurations private LogRecordScannerCallback callback; + private HoodieRecordMerger recordMerger; public Builder withFileSystem(FileSystem fs) { this.fs = fs; @@ -135,10 +137,18 @@ public Builder withLogRecordScannerCallback(LogRecordScannerCallback callback) { return this; } + @Override + public Builder withRecordMerger(HoodieRecordMerger recordMerger) { + this.recordMerger = recordMerger; + return this; + } + @Override public HoodieUnMergedLogRecordScanner build() { + ValidationUtils.checkArgument(recordMerger != null); + return new HoodieUnMergedLogRecordScanner(fs, basePath, logFilePaths, readerSchema, - latestInstantTime, readBlocksLazily, reverseReader, bufferSize, callback, instantRange); + latestInstantTime, readBlocksLazily, reverseReader, bufferSize, callback, instantRange, recordMerger); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java index 2712d000ac285..9c6135dd28f72 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java @@ -18,6 +18,16 @@ package org.apache.hudi.common.table.log.block; +import org.apache.hudi.common.fs.SizeAwareDataInputStream; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.MappingIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.internal.schema.InternalSchema; + import org.apache.avro.Schema; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; @@ -29,13 +39,6 @@ import org.apache.avro.io.Encoder; import org.apache.avro.io.EncoderFactory; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hudi.common.fs.SizeAwareDataInputStream; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.ClosableIterator; -import 
org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.internal.schema.InternalSchema; import javax.annotation.Nonnull; @@ -54,10 +57,10 @@ import java.util.List; import java.util.Map; import java.util.Properties; -import java.util.stream.Collectors; import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; import static org.apache.hudi.common.util.ValidationUtils.checkState; /** @@ -115,13 +118,14 @@ protected byte[] serializeRecords(List records) throws IOException output.writeInt(records.size()); // 3. Write the records - for (HoodieRecord s : records) { + for (HoodieRecord s : records) { ByteArrayOutputStream temp = new ByteArrayOutputStream(); BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(temp, encoderCache.get()); encoderCache.set(encoder); try { // Encode the record into bytes - IndexedRecord data = (IndexedRecord) s.toIndexedRecord(schema, new Properties()).get(); + // Spark Record not support write avro log + IndexedRecord data = s.toIndexedRecord(schema, new Properties()).get().getData(); writer.write(data, encoder); encoder.flush(); @@ -142,25 +146,25 @@ protected byte[] serializeRecords(List records) throws IOException // TODO (na) - Break down content into smaller chunks of byte [] to be GC as they are used @Override - protected ClosableIterator deserializeRecords(byte[] content, HoodieRecord.Mapper mapper) throws IOException { + protected ClosableIterator> deserializeRecords(byte[] content, HoodieRecordType type) throws IOException { checkState(this.readerSchema != null, "Reader's schema has to be non-null"); - return RecordIterator.getInstance(this, content, internalSchema, mapper); + checkArgument(type != HoodieRecordType.SPARK, "Not support read avro to spark record"); + // TODO AvroSparkReader need + RecordIterator iterator = RecordIterator.getInstance(this, content, internalSchema); + return new MappingIterator<>(iterator, data -> (HoodieRecord) new HoodieAvroIndexedRecord(data)); } - public static class RecordIterator implements ClosableIterator { + private static class RecordIterator implements ClosableIterator { private byte[] content; private final SizeAwareDataInputStream dis; private final GenericDatumReader reader; private final ThreadLocal decoderCache = new ThreadLocal<>(); - private final HoodieRecord.Mapper mapper; - private final Schema finalReadSchema; private int totalRecords = 0; private int readRecords = 0; - private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content, InternalSchema internalSchema, HoodieRecord.Mapper mapper) throws IOException { + private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content, InternalSchema internalSchema) throws IOException { this.content = content; - this.mapper = mapper; this.dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(this.content))); @@ -177,7 +181,6 @@ private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content, finalReadSchema = writerSchema; } - this.finalReadSchema = finalReadSchema; this.reader = new GenericDatumReader<>(writerSchema, finalReadSchema); if (logBlockVersion.hasRecordCount()) { @@ -185,14 +188,10 @@ private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content, } } - public static RecordIterator getInstance(HoodieAvroDataBlock dataBlock, byte[] content, InternalSchema internalSchema, HoodieRecord.Mapper mapper) throws 
IOException { + public static RecordIterator getInstance(HoodieAvroDataBlock dataBlock, byte[] content, InternalSchema internalSchema) throws IOException { // Get schema from the header Schema writerSchema = new Schema.Parser().parse(dataBlock.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); - return new RecordIterator(dataBlock.readerSchema, writerSchema, content, internalSchema, mapper); - } - - public Schema getFinalReadSchema() { - return finalReadSchema; + return new RecordIterator(dataBlock.readerSchema, writerSchema, content, internalSchema); } @Override @@ -212,7 +211,7 @@ public boolean hasNext() { } @Override - public HoodieRecord next() { + public IndexedRecord next() { try { int recordLength = this.dis.readInt(); BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(this.content, this.dis.getNumberOfBytesRead(), @@ -221,7 +220,7 @@ public HoodieRecord next() { IndexedRecord record = this.reader.read(null, decoder); this.dis.skipBytes(recordLength); this.readRecords++; - return mapper.apply(record); + return record; } catch (IOException e) { throw new HoodieIOException("Unable to convert bytes to record.", e); } @@ -274,18 +273,18 @@ public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema, GenericDatumReader reader = new GenericDatumReader<>(writerSchema, readerSchema); // 2. Get the total records int totalRecords = dis.readInt(); - List records = new ArrayList<>(totalRecords); + List records = new ArrayList<>(totalRecords); // 3. Read the content for (int i = 0; i < totalRecords; i++) { int recordLength = dis.readInt(); Decoder decoder = DecoderFactory.get().binaryDecoder(content, dis.getNumberOfBytesRead(), recordLength, null); IndexedRecord record = reader.read(null, decoder); - records.add(record); + records.add(new HoodieAvroIndexedRecord(record)); dis.skipBytes(recordLength); } dis.close(); - return new HoodieAvroDataBlock(records.stream().map(HoodieAvroIndexedRecord::new).collect(Collectors.toList()), readerSchema); + return new HoodieAvroDataBlock(records, readerSchema); } private static byte[] compress(String text) { @@ -327,8 +326,8 @@ public byte[] getBytes(Schema schema) throws IOException { output.writeInt(schemaContent.length); output.write(schemaContent); - List records = new ArrayList<>(); - try (ClosableIterator recordItr = getRecordIterator(HoodieAvroIndexedRecord::new)) { + List> records = new ArrayList<>(); + try (ClosableIterator> recordItr = getRecordIterator(HoodieRecordType.AVRO)) { recordItr.forEachRemaining(records::add); } @@ -336,9 +335,9 @@ public byte[] getBytes(Schema schema) throws IOException { output.writeInt(records.size()); // 3. 
Write the records - Iterator itr = records.iterator(); + Iterator> itr = records.iterator(); while (itr.hasNext()) { - IndexedRecord s = (IndexedRecord)itr.next().getData(); + IndexedRecord s = itr.next().toIndexedRecord(schema, new Properties()).get().getData(); ByteArrayOutputStream temp = new ByteArrayOutputStream(); Encoder encoder = EncoderFactory.get().binaryEncoder(temp, null); try { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java index d88fff750f47a..ab669730f602a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.table.log.block; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; @@ -35,6 +36,7 @@ import java.util.Set; import java.util.function.Function; +import static org.apache.hudi.common.util.TypeUtils.unsafeCast; import static org.apache.hudi.common.util.ValidationUtils.checkState; /** @@ -138,13 +140,14 @@ protected static Schema getWriterSchema(Map logBlock /** * Returns all the records iterator contained w/in this block. */ - public final ClosableIterator getRecordIterator(HoodieRecord.Mapper mapper) { + public final ClosableIterator> getRecordIterator(HoodieRecordType type) { if (records.isPresent()) { - return list2Iterator(records.get()); + // TODO need convert record type + return list2Iterator(unsafeCast(records.get())); } try { // in case records are absent, read content lazily and then convert to IndexedRecords - return readRecordsFromBlockPayload(mapper); + return readRecordsFromBlockPayload(type); } catch (IOException io) { throw new HoodieIOException("Unable to convert content bytes to records", io); } @@ -162,38 +165,38 @@ public Schema getSchema() { * @return List of IndexedRecords for the keys of interest. 
* @throws IOException in case of failures encountered when reading/parsing records */ - public final ClosableIterator getRecordIterator(List keys, boolean fullKey, HoodieRecord.Mapper mapper) throws IOException { + public final ClosableIterator> getRecordIterator(List keys, boolean fullKey, HoodieRecordType type) throws IOException { boolean fullScan = keys.isEmpty(); if (enablePointLookups && !fullScan) { - return lookupRecords(keys, fullKey, mapper); + return lookupRecords(keys, fullKey); } // Otherwise, we fetch all the records and filter out all the records, but the // ones requested - ClosableIterator allRecords = getRecordIterator(mapper); + ClosableIterator> allRecords = getRecordIterator(type); if (fullScan) { return allRecords; } HashSet keySet = new HashSet<>(keys); - return FilteringIterator.getInstance(allRecords, keySet, fullKey, this::getRecordKey); + return FilteringIterator.getInstance(allRecords, keySet, fullKey, record -> getRecordKey(record)); } - protected ClosableIterator readRecordsFromBlockPayload(HoodieRecord.Mapper mapper) throws IOException { + protected ClosableIterator> readRecordsFromBlockPayload(HoodieRecordType type) throws IOException { if (readBlockLazily && !getContent().isPresent()) { // read log block contents from disk inflate(); } try { - return deserializeRecords(getContent().get(), mapper); + return deserializeRecords(getContent().get(), type); } finally { // Free up content to be GC'd by deflating the block deflate(); } } - protected ClosableIterator lookupRecords(List keys, boolean fullKey, HoodieRecord.Mapper mapper) throws IOException { + protected ClosableIterator> lookupRecords(List keys, boolean fullKey) throws IOException { throw new UnsupportedOperationException( String.format("Point lookups are not supported by this Data block type (%s)", getBlockType()) ); @@ -201,7 +204,7 @@ protected ClosableIterator lookupRecords(List keys, boolea protected abstract byte[] serializeRecords(List records) throws IOException; - protected abstract ClosableIterator deserializeRecords(byte[] content, HoodieRecord.Mapper mapper) throws IOException; + protected abstract ClosableIterator> deserializeRecords(byte[] content, HoodieRecordType type) throws IOException; public abstract HoodieLogBlockType getBlockType(); @@ -244,29 +247,29 @@ public T next() { * A {@link ClosableIterator} that supports filtering strategy with given keys. * User should supply the key extraction function for fetching string format keys. 
*/ - private static class FilteringIterator implements ClosableIterator { - private final ClosableIterator nested; // nested iterator + private static class FilteringIterator implements ClosableIterator> { + private final ClosableIterator> nested; // nested iterator private final Set keys; // the filtering keys private final boolean fullKey; - private final Function> keyExtract; // function to extract the key + private final Function, Option> keyExtract; // function to extract the key - private HoodieRecord next; + private HoodieRecord next; - private FilteringIterator(ClosableIterator nested, Set keys, boolean fullKey, Function> keyExtract) { + private FilteringIterator(ClosableIterator> nested, Set keys, boolean fullKey, Function, Option> keyExtract) { this.nested = nested; this.keys = keys; this.fullKey = fullKey; this.keyExtract = keyExtract; } - public static FilteringIterator getInstance( - ClosableIterator nested, + public static FilteringIterator getInstance( + ClosableIterator> nested, Set keys, boolean fullKey, - Function> keyExtract) { - return new FilteringIterator(nested, keys, fullKey, keyExtract); + Function, Option> keyExtract) { + return new FilteringIterator<>(nested, keys, fullKey, keyExtract); } @Override @@ -292,7 +295,7 @@ public boolean hasNext() { } @Override - public HoodieRecord next() { + public HoodieRecord next() { return this.next; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java index ca2c46022cd11..37db0caa9cf44 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java @@ -33,8 +33,14 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.fs.inline.InLineFSUtils; import org.apache.hudi.common.fs.inline.InLineFileSystem; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; +import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; +import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockContentLocation; +import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.MappingIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; @@ -56,6 +62,7 @@ import java.util.TreeMap; import static org.apache.hudi.common.util.ValidationUtils.checkState; +import static org.apache.hudi.common.util.TypeUtils.unsafeCast; /** * HoodieHFileDataBlock contains a list of records stored inside an HFile format. 
It is used with the HFile @@ -125,7 +132,7 @@ protected byte[] serializeRecords(List records) throws IOException Iterator itr = records.iterator(); int id = 0; while (itr.hasNext()) { - HoodieRecord record = itr.next(); + HoodieRecord record = itr.next(); String recordKey; if (useIntegerKey) { recordKey = String.format("%" + keyWidth + "s", id++); @@ -162,7 +169,7 @@ protected byte[] serializeRecords(List records) throws IOException } @Override - protected ClosableIterator deserializeRecords(byte[] content, HoodieRecord.Mapper mapper) throws IOException { + protected ClosableIterator> deserializeRecords(byte[] content, HoodieRecordType type) throws IOException { checkState(readerSchema != null, "Reader's schema has to be non-null"); // Get schema from the header @@ -170,28 +177,12 @@ protected ClosableIterator deserializeRecords(byte[] content, Hood // Read the content HoodieAvroHFileReader reader = new HoodieAvroHFileReader(null, pathForReader, content, Option.of(writerSchema)); - Iterator recordIterator = reader.getRecordIterator(readerSchema); - return new ClosableIterator() { - @Override - public void close() { - reader.close(); - } - - @Override - public boolean hasNext() { - return recordIterator.hasNext(); - } - - @Override - public HoodieRecord next() { - return mapper.apply(recordIterator.next()); - } - }; + return unsafeCast(reader.getRecordIterator(readerSchema)); } // TODO abstract this w/in HoodieDataBlock @Override - protected ClosableIterator lookupRecords(List keys, boolean fullKey, HoodieRecord.Mapper mapper) throws IOException { + protected ClosableIterator> lookupRecords(List keys, boolean fullKey) throws IOException { HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get(); // NOTE: It's important to extend Hadoop configuration here to make sure configuration @@ -216,32 +207,16 @@ protected ClosableIterator lookupRecords(List keys, boolea // Get writer's schema from the header final ClosableIterator recordIterator = - fullKey ? reader.getRecordsByKeysIterator(sortedKeys, readerSchema) : reader.getRecordsByKeyPrefixIterator(sortedKeys, readerSchema); + fullKey ? 
reader.getIndexedRecordsByKeysIterator(sortedKeys, readerSchema) : reader.getIndexedRecordsByKeyPrefixIterator(sortedKeys, readerSchema); - return new ClosableIterator() { - @Override - public boolean hasNext() { - return recordIterator.hasNext(); - } - - @Override - public HoodieRecord next() { - return mapper.apply(recordIterator.next()); - } - - @Override - public void close() { - recordIterator.close(); - reader.close(); - } - }; + return new MappingIterator<>(recordIterator, data -> (HoodieRecord) new HoodieAvroIndexedRecord((data))); } - private byte[] serializeRecord(HoodieRecord record, Schema schema) throws IOException { + private byte[] serializeRecord(HoodieRecord record, Schema schema) throws IOException { Option keyField = getKeyField(schema); // Reset key value w/in the record to avoid duplicating the key w/in payload if (keyField.isPresent()) { - record.overrideMetadataFieldValue(schema, new Properties(), keyField.get().pos(), StringUtils.EMPTY_STRING); + record.updateValues(schema, new Properties(), Collections.singletonMap(keyField.get().name(), StringUtils.EMPTY_STRING)); } return HoodieAvroUtils.recordToBytes(record, schema).get(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java index 98f5dcf3a073a..79d3b3ec86c3e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java @@ -18,30 +18,25 @@ package org.apache.hudi.common.table.log.block; -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.fs.inline.InLineFSUtils; import org.apache.hudi.common.fs.inline.InLineFileSystem; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ParquetReaderIterator; -import org.apache.hudi.io.storage.HoodieAvroFileReader.HoodieRecordTransformIterator; -import org.apache.hudi.io.storage.HoodieParquetConfig; -import org.apache.hudi.io.storage.HoodieParquetStreamWriter; -import org.apache.parquet.avro.AvroParquetReader; -import org.apache.parquet.avro.AvroReadSupport; -import org.apache.parquet.avro.AvroSchemaConverter; -import org.apache.parquet.hadoop.ParquetReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.io.storage.HoodieFileWriter; +import org.apache.hudi.io.storage.HoodieFileWriterFactory; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.parquet.hadoop.util.HadoopInputFile; -import org.apache.parquet.io.InputFile; import javax.annotation.Nonnull; import java.io.ByteArrayOutputStream; 
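The lookup path in HoodieHFileDataBlock above no longer hand-rolls an anonymous ClosableIterator; it adapts the HFile reader's IndexedRecord iterator with MappingIterator into HoodieAvroIndexedRecord instances. A minimal sketch of that adaptation pattern follows; SimpleMappingIterator is a hypothetical name and plain java.util types are used for illustration.

import java.util.Iterator;
import java.util.function.Function;

// Illustrative lazy adapter: exposes an Iterator<S> as an Iterator<T> via a mapping function,
// e.g. Avro IndexedRecord -> HoodieAvroIndexedRecord as done in lookupRecords above.
final class SimpleMappingIterator<S, T> implements Iterator<T> {
  private final Iterator<S> source;
  private final Function<S, T> mapper;

  SimpleMappingIterator(Iterator<S> source, Function<S, T> mapper) {
    this.source = source;
    this.mapper = mapper;
  }

  @Override
  public boolean hasNext() {
    return source.hasNext();
  }

  @Override
  public T next() {
    // Each underlying record is converted lazily, one element at a time.
    return mapper.apply(source.next());
  }
}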
@@ -50,6 +45,14 @@ import java.util.List; import java.util.Map; +import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_BLOCK_SIZE; +import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME; +import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION; +import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_MAX_FILE_SIZE; +import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_PAGE_SIZE; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; +import static org.apache.hudi.common.table.HoodieTableConfig.POPULATE_META_FIELDS; + /** * HoodieParquetDataBlock contains a list of records serialized using Parquet. */ @@ -93,53 +96,47 @@ protected byte[] serializeRecords(List records) throws IOException } Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); - - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( - new AvroSchemaConverter().convert(writerSchema), writerSchema, Option.empty()); - - HoodieParquetConfig avroParquetConfig = - new HoodieParquetConfig<>( - writeSupport, - compressionCodecName.get(), - ParquetWriter.DEFAULT_BLOCK_SIZE, - ParquetWriter.DEFAULT_PAGE_SIZE, - 1024 * 1024 * 1024, - new Configuration(), - Double.parseDouble(String.valueOf(0.1)));//HoodieStorageConfig.PARQUET_COMPRESSION_RATIO.defaultValue())); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - - try (FSDataOutputStream outputStream = new FSDataOutputStream(baos)) { - try (HoodieParquetStreamWriter parquetWriter = new HoodieParquetStreamWriter(outputStream, avroParquetConfig)) { - for (HoodieRecord record : records) { + try (FSDataOutputStream outputStream = new FSDataOutputStream(baos, null)) { + HoodieFileWriter parquetWriter = null; + HoodieConfig config = new HoodieConfig(); + config.setValue(PARQUET_COMPRESSION_CODEC_NAME.key(), compressionCodecName.get().name()); + config.setValue(PARQUET_BLOCK_SIZE.key(), String.valueOf(ParquetWriter.DEFAULT_BLOCK_SIZE)); + config.setValue(PARQUET_PAGE_SIZE.key(), String.valueOf(ParquetWriter.DEFAULT_PAGE_SIZE)); + config.setValue(PARQUET_MAX_FILE_SIZE.key(), String.valueOf(1024 * 1024 * 1024)); + config.setValue(PARQUET_COMPRESSION_RATIO_FRACTION.key(), String.valueOf(0.1)); + config.setValue(POPULATE_META_FIELDS.key(), "false"); + HoodieRecordType recordType = records.iterator().next().getRecordType(); + try { + parquetWriter = HoodieFileWriterFactory.getFileWriter( + HoodieFileFormat.PARQUET, + outputStream, + new Configuration(), + config, + writerSchema, + recordType); + for (HoodieRecord record : records) { String recordKey = getRecordKey(record).orElse(null); parquetWriter.write(recordKey, record, writerSchema); } outputStream.flush(); + } finally { + if (parquetWriter != null) { + parquetWriter.close(); + } } } return baos.toByteArray(); } - public static ClosableIterator getProjectedParquetRecordsIterator(Configuration conf, - Schema readerSchema, - InputFile inputFile) throws IOException { - AvroReadSupport.setAvroReadSchema(conf, readerSchema); - AvroReadSupport.setRequestedProjection(conf, readerSchema); - - ParquetReader reader = - AvroParquetReader.builder(inputFile).withConf(conf).build(); - return new ParquetReaderIterator<>(reader); - } - /** * NOTE: We're overriding the whole reading sequence to make sure we properly respect * the requested Reader's schema and only fetch the columns that have been explicitly * requested by the caller 
(providing projected Reader's schema) */ @Override - protected ClosableIterator readRecordsFromBlockPayload(HoodieRecord.Mapper mapper) throws IOException { + protected ClosableIterator> readRecordsFromBlockPayload(HoodieRecordType type) throws IOException { HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get(); // NOTE: It's important to extend Hadoop configuration here to make sure configuration @@ -153,15 +150,13 @@ protected ClosableIterator readRecordsFromBlockPayload(HoodieRecor blockContentLoc.getContentPositionInLogFile(), blockContentLoc.getBlockSize()); - ClosableIterator iterator = getProjectedParquetRecordsIterator( - inlineConf, - readerSchema, - HadoopInputFile.fromPath(inlineLogFilePath, inlineConf)); - return new HoodieRecordTransformIterator(iterator, mapper); + ClosableIterator> iterator = HoodieFileReaderFactory.getReaderFactory(type).getFileReader(inlineConf, inlineLogFilePath, PARQUET) + .getRecordIterator(readerSchema, readerSchema); + return iterator; } @Override - protected ClosableIterator deserializeRecords(byte[] content, HoodieRecord.Mapper mapper) throws IOException { + protected ClosableIterator> deserializeRecords(byte[] content, HoodieRecordType type) throws IOException { throw new UnsupportedOperationException("Should not be invoked"); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java index 22dd2b7ee3f74..3ed6e8d2d5b9f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java @@ -22,10 +22,10 @@ import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; import org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan; import org.apache.hudi.common.fs.HoodieWrapperFileSystem; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; @@ -259,12 +259,12 @@ private List loadInstants(TimeRangeFilter filter, boolean loadIns HoodieAvroDataBlock avroBlock = (HoodieAvroDataBlock) block; // TODO If we can store additional metadata in datablock, we can skip parsing records // (such as startTime, endTime of records in the block) - try (ClosableIterator itr = avroBlock.getRecordIterator(HoodieAvroIndexedRecord::new)) { + try (ClosableIterator> itr = avroBlock.getRecordIterator(HoodieRecordType.AVRO)) { StreamSupport.stream(Spliterators.spliteratorUnknownSize(itr, Spliterator.IMMUTABLE), true) // Filter blocks in desired time window - .map(r -> (GenericRecord) ((HoodieAvroIndexedRecord) r).toIndexedRecord().get()) + .map(r -> (GenericRecord) r.getData()) .filter(commitsFilter::apply) - .map(r -> readCommit((GenericRecord) r, loadInstantDetails)) + .map(r -> readCommit(r, loadInstantDetails)) .filter(c -> filter == null || filter.isInRange(c)) .forEach(instantsInRange::add); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ConfigUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ConfigUtils.java new file mode 
100644 index 0000000000000..60aa6c43a981c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ConfigUtils.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import org.apache.hudi.common.model.HoodiePayloadProps; +import org.apache.hudi.common.table.HoodieTableConfig; + +import java.util.Properties; + +public class ConfigUtils { + + /** + * Get ordering field. + */ + public static String getOrderingField(Properties properties) { + String orderField = null; + if (properties.containsKey(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY)) { + orderField = properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY); + } else if (properties.containsKey("hoodie.datasource.write.precombine.field")) { + orderField = properties.getProperty("hoodie.datasource.write.precombine.field"); + } else if (properties.containsKey(HoodieTableConfig.PRECOMBINE_FIELD.key())) { + orderField = properties.getProperty(HoodieTableConfig.PRECOMBINE_FIELD.key()); + } + return orderField; + } + + /** + * Get payload class. + */ + public static String getPayloadClass(Properties properties) { + String payloadClass = null; + if (properties.containsKey(HoodieTableConfig.PAYLOAD_CLASS_NAME.key())) { + payloadClass = properties.getProperty(HoodieTableConfig.PAYLOAD_CLASS_NAME.key()); + } else if (properties.containsKey("hoodie.datasource.write.payload.class")) { + payloadClass = properties.getProperty("hoodie.datasource.write.payload.class"); + } + return payloadClass; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java index feebf6c48224a..32955abbf0c44 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java @@ -19,21 +19,18 @@ package org.apache.hudi.common.util; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.avro.Schema; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.Serializable; - /** * Size Estimator for Hoodie record payload. 
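A short usage sketch for the new ConfigUtils helpers introduced above. The property keys mirror the lookup order in the diff; the field and payload-class values are illustrative.

import java.util.Properties;
import org.apache.hudi.common.util.ConfigUtils;

public class ConfigUtilsExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    // getOrderingField falls through payload ordering key -> datasource precombine key -> table config key.
    props.setProperty("hoodie.datasource.write.precombine.field", "ts");
    // getPayloadClass falls through table config payload class -> datasource payload class.
    props.setProperty("hoodie.datasource.write.payload.class",
        "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload");

    String orderingField = ConfigUtils.getOrderingField(props); // "ts"
    String payloadClass = ConfigUtils.getPayloadClass(props);   // the payload class set above
    System.out.println(orderingField + " / " + payloadClass);
  }
}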
* * @param */ -public class HoodieRecordSizeEstimator implements SizeEstimator>, Serializable { +public class HoodieRecordSizeEstimator implements SizeEstimator> { private static final Logger LOG = LogManager.getLogger(HoodieRecordSizeEstimator.class); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordUtils.java index 075d117fe2120..92b14ff340b5e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordUtils.java @@ -18,13 +18,20 @@ package org.apache.hudi.common.util; -import org.apache.hudi.common.model.HoodieMerge; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.exception.HoodieException; - import java.lang.reflect.InvocationTargetException; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.Objects; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; +import org.apache.hudi.common.model.HoodieRecordMerger; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** * A utility class for HoodieRecord. @@ -32,37 +39,74 @@ public class HoodieRecordUtils { private static final Map INSTANCE_CACHE = new HashMap<>(); + private static final Logger LOG = LogManager.getLogger(HoodieRecordUtils.class); /** * Instantiate a given class with a record merge. */ - public static HoodieMerge loadMerge(String mergeClass) { + public static HoodieRecordMerger loadRecordMerger(String mergerClass) { try { - HoodieMerge merge = (HoodieMerge) INSTANCE_CACHE.get(mergeClass); - if (null == merge) { - synchronized (HoodieMerge.class) { - merge = (HoodieMerge) INSTANCE_CACHE.get(mergeClass); - if (null == merge) { - merge = (HoodieMerge)ReflectionUtils.loadClass(mergeClass, new Object[]{}); - INSTANCE_CACHE.put(mergeClass, merge); + HoodieRecordMerger recordMerger = (HoodieRecordMerger) INSTANCE_CACHE.get(mergerClass); + if (null == recordMerger) { + synchronized (HoodieRecordMerger.class) { + recordMerger = (HoodieRecordMerger) INSTANCE_CACHE.get(mergerClass); + if (null == recordMerger) { + recordMerger = (HoodieRecordMerger) ReflectionUtils.loadClass(mergerClass, + new Object[]{}); + INSTANCE_CACHE.put(mergerClass, recordMerger); } } } - return merge; + return recordMerger; } catch (HoodieException e) { throw new HoodieException("Unable to instantiate hoodie merge class ", e); } } + /** + * Instantiate a given class with a record merge. 
+ */ + public static HoodieRecordMerger generateRecordMerger(String basePath, EngineType engineType, + List mergerClassList, String mergerStrategy) { + if (mergerClassList.isEmpty() || HoodieTableMetadata.isMetadataTable(basePath)) { + return HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName()); + } else { + return mergerClassList.stream() + .map(clazz -> { + try { + return loadRecordMerger(clazz); + } catch (HoodieException e) { + LOG.warn(String.format("Unable to init %s", clazz), e); + return null; + } + }) + .filter(Objects::nonNull) + .filter(merger -> merger.getMergingStrategy().equals(mergerStrategy)) + .filter(merger -> recordTypeCompatibleEngine(merger.getRecordType(), engineType)) + .findFirst() + .orElse(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())); + } + } + /** * Instantiate a given class with an avro record payload. */ - public static T loadPayload(String recordPayloadClass, Object[] payloadArgs, - Class... constructorArgTypes) { + public static T loadPayload(String recordPayloadClass, + Object[] payloadArgs, + Class... constructorArgTypes) { try { - return (T) ReflectionUtils.getClass(recordPayloadClass).getConstructor(constructorArgTypes).newInstance(payloadArgs); + return (T) ReflectionUtils.getClass(recordPayloadClass).getConstructor(constructorArgTypes) + .newInstance(payloadArgs); } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { throw new HoodieException("Unable to instantiate payload class ", e); } } + + public static boolean recordTypeCompatibleEngine(HoodieRecordType recordType, EngineType engineType) { + if (engineType == EngineType.SPARK && recordType == HoodieRecordType.SPARK) { + return true; + } else { + return false; + } + } } \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java index 03bd471b606f1..ca8c3ab428768 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java @@ -24,6 +24,7 @@ import org.apache.parquet.hadoop.ParquetReader; import java.io.IOException; +import java.util.function.Function; /** * This class wraps a parquet reader and provides an iterator based api to read from a parquet file. 
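A usage sketch for the merger-loading helpers added in HoodieRecordUtils above. The Spark merger class name is hypothetical; per the logic in the diff, generateRecordMerger falls back to HoodieAvroRecordMerger when no candidate matches the requested strategy and engine.

import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.engine.EngineType;
import org.apache.hudi.common.model.HoodieAvroRecordMerger;
import org.apache.hudi.common.model.HoodieRecordMerger;
import org.apache.hudi.common.util.HoodieRecordUtils;

public class RecordMergerExample {
  public static void main(String[] args) {
    // Instantiate (and cache) a merger implementation directly by class name.
    HoodieRecordMerger avroMerger =
        HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName());

    // Pick the first candidate whose merging strategy and record type fit the engine,
    // otherwise fall back to the Avro merger. The Spark class below is illustrative only.
    List<String> candidates = Arrays.asList(
        "org.apache.hudi.HoodieSparkRecordMerger",      // hypothetical engine-specific merger
        HoodieAvroRecordMerger.class.getName());
    HoodieRecordMerger merger = HoodieRecordUtils.generateRecordMerger(
        "/tmp/hudi/my_table", EngineType.SPARK, candidates, avroMerger.getMergingStrategy());
    System.out.println("Selected merger: " + merger.getClass().getName());
  }
}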
This is used in @@ -35,17 +36,24 @@ public class ParquetReaderIterator implements ClosableIterator { private final ParquetReader parquetReader; // Holds the next entry returned by the parquet reader private T next; + // For directly use InternalRow + private Function mapper; public ParquetReaderIterator(ParquetReader parquetReader) { this.parquetReader = parquetReader; } + public ParquetReaderIterator(ParquetReader parquetReader, Function mapper) { + this.parquetReader = parquetReader; + this.mapper = mapper; + } + @Override public boolean hasNext() { try { // To handle when hasNext() is called multiple times for idempotency and/or the first time if (this.next == null) { - this.next = parquetReader.read(); + this.next = read(); } return this.next != null; } catch (Exception e) { @@ -64,7 +72,7 @@ public T next() { } } T retVal = this.next; - this.next = parquetReader.read(); + this.next = read(); return retVal; } catch (Exception e) { FileIOUtils.closeQuietly(parquetReader); @@ -72,6 +80,15 @@ public T next() { } } + private T read() throws IOException { + T record = parquetReader.read(); + if (mapper == null || record == null) { + return record; + } else { + return mapper.apply(record); + } + } + public void close() { try { parquetReader.close(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java index 9041db5144458..6f3b3ef928200 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java @@ -19,8 +19,11 @@ package org.apache.hudi.common.util; import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.Serializer; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; import org.objenesis.strategy.StdInstantiatorStrategy; import java.io.ByteArrayOutputStream; @@ -36,6 +39,14 @@ public class SerializationUtils { private static final ThreadLocal SERIALIZER_REF = ThreadLocal.withInitial(KryoSerializerInstance::new); + private static Pair> SERIALIZER_REGISTER = null; + + public static void setOverallRegister(String className, Serializer serializer) { + if (SERIALIZER_REGISTER == null) { + SERIALIZER_REGISTER = Pair.of(className, serializer); + } + } + // Serialize // ----------------------------------------------------------------------- @@ -121,6 +132,13 @@ public Kryo newKryo() { // Handle cases where we may have an odd classloader setup like with libjars // for hadoop kryo.setClassLoader(Thread.currentThread().getContextClassLoader()); + if (SERIALIZER_REGISTER != null) { + try { + kryo.register(Class.forName(SERIALIZER_REGISTER.getLeft()), SERIALIZER_REGISTER.getRight()); + } catch (ClassNotFoundException e) { + throw new HoodieException(e); + } + } return kryo; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java index d2d91bbfb6e11..d8cf85cfd4533 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java @@ -106,13 +106,13 @@ public static long computePayloadSize(R value, SizeEstimator valueSizeEst /** * Utility method to convert bytes to HoodieRecord using schema and payload 
class. */ - public static R convertToHoodieRecordPayload(GenericRecord rec, String payloadClazz, String preCombineField, boolean withOperationField) { + public static HoodieRecord convertToHoodieRecordPayload(GenericRecord rec, String payloadClazz, String preCombineField, boolean withOperationField) { return convertToHoodieRecordPayload(rec, payloadClazz, preCombineField, Pair.of(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.PARTITION_PATH_METADATA_FIELD), withOperationField, Option.empty()); } - public static R convertToHoodieRecordPayload(GenericRecord record, String payloadClazz, + public static HoodieRecord convertToHoodieRecordPayload(GenericRecord record, String payloadClazz, String preCombineField, boolean withOperationField, Option partitionName) { @@ -124,7 +124,7 @@ public static R convertToHoodieRecordPayload(GenericRecord record, String pa /** * Utility method to convert bytes to HoodieRecord using schema and payload class. */ - public static R convertToHoodieRecordPayload(GenericRecord record, String payloadClazz, + public static HoodieRecord convertToHoodieRecordPayload(GenericRecord record, String payloadClazz, String preCombineField, Pair recordKeyPartitionPathFieldPair, boolean withOperationField, @@ -140,7 +140,7 @@ public static R convertToHoodieRecordPayload(GenericRecord record, String pa HoodieRecordUtils.loadPayload(payloadClazz, new Object[]{record, preCombineVal}, GenericRecord.class, Comparable.class), operation); - return (R) hoodieRecord; + return (HoodieRecord) hoodieRecord; } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java index a4f2c62437caa..9499954911b6e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java @@ -33,6 +33,8 @@ public class StringUtils { public static final String EMPTY_STRING = ""; + public static final String DEFAULT_MERGER_STRATEGY_UUID = "eeb8d96f-b1e4-49fd-bbf8-28ac514178e5"; + /** *

* Joins the elements of the provided array into a single String containing the provided list of elements. diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReader.java index f454fa28f1929..c2ddfa319fae9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReader.java @@ -18,85 +18,71 @@ package org.apache.hudi.io.storage; -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; - +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecord.Mapper; import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.MappingIterator; import org.apache.hudi.common.util.Option; +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; + import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.List; -public interface HoodieAvroFileReader extends HoodieFileReader, AutoCloseable { +import static org.apache.hudi.common.util.TypeUtils.unsafeCast; + +public interface HoodieAvroFileReader extends HoodieFileReader, AutoCloseable { + + ClosableIterator getIndexedRecordIterator(Schema readerSchema) throws IOException; - ClosableIterator getRecordIterator(Schema readerSchema) throws IOException; + ClosableIterator getIndexedRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException; - default Option getRecordByKey(String key, Schema readerSchema) throws IOException { + default Option getIndexedRecordByKey(String key, Schema readerSchema) throws IOException { throw new UnsupportedOperationException(); } - default ClosableIterator getRecordsByKeysIterator(List keys, Schema schema) throws IOException { + default ClosableIterator getIndexedRecordsByKeysIterator(List keys, Schema schema) throws IOException { throw new UnsupportedOperationException(); } - default ClosableIterator getRecordsByKeysIterator(List keys) throws IOException { - return getRecordsByKeysIterator(keys, getSchema()); + default ClosableIterator getIndexedRecordsByKeysIterator(List keys) throws IOException { + return getIndexedRecordsByKeysIterator(keys, getSchema()); } - default ClosableIterator getRecordsByKeyPrefixIterator(List keyPrefixes, Schema schema) throws IOException { + default ClosableIterator getIndexedRecordsByKeyPrefixIterator(List keyPrefixes, Schema schema) throws IOException { throw new UnsupportedEncodingException(); } - default ClosableIterator getRecordsByKeyPrefixIterator(List keyPrefixes) throws IOException { - return getRecordsByKeyPrefixIterator(keyPrefixes, getSchema()); + default ClosableIterator getIndexedRecordsByKeyPrefixIterator(List keyPrefixes) throws IOException { + return getIndexedRecordsByKeyPrefixIterator(keyPrefixes, getSchema()); } - default ClosableIterator getRecordsByKeysIterator(List keys, Schema schema, HoodieRecord.Mapper mapper) throws IOException { - ClosableIterator iterator = getRecordsByKeysIterator(keys, schema); - return new HoodieRecordTransformIterator(iterator, mapper); + default ClosableIterator> getRecordsByKeysIterator(List keys, Schema schema) throws IOException { + ClosableIterator iterator = getIndexedRecordsByKeysIterator(keys, schema); + return new MappingIterator<>(iterator, data -> unsafeCast(new HoodieAvroIndexedRecord(data))); } - default 
ClosableIterator getRecordsByKeyPrefixIterator(List keyPrefixes, Schema schema, HoodieRecord.Mapper mapper) throws IOException { - ClosableIterator iterator = getRecordsByKeyPrefixIterator(keyPrefixes, schema); - return new HoodieRecordTransformIterator(iterator, mapper); + default ClosableIterator> getRecordsByKeyPrefixIterator(List keyPrefixes, Schema schema) throws IOException { + ClosableIterator iterator = getIndexedRecordsByKeyPrefixIterator(keyPrefixes, schema); + return new MappingIterator<>(iterator, data -> unsafeCast(new HoodieAvroIndexedRecord(data))); } @Override - default ClosableIterator getRecordIterator(Schema schema, HoodieRecord.Mapper mapper) throws IOException { - return new MappingIterator<>(getRecordIterator(schema), mapper::apply); + default ClosableIterator> getRecordIterator(Schema schema) throws IOException { + ClosableIterator iterator = getIndexedRecordIterator(schema); + return new MappingIterator<>(iterator, data -> unsafeCast(new HoodieAvroIndexedRecord(data))); } @Override - default Option getRecordByKey(String key, Schema readerSchema, HoodieRecord.Mapper mapper) throws IOException { - return getRecordByKey(key, readerSchema).map(mapper::apply); + default ClosableIterator> getRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException { + ClosableIterator iterator = getIndexedRecordIterator(readerSchema, requestedSchema); + return new MappingIterator<>(iterator, data -> unsafeCast(new HoodieAvroIndexedRecord(data))); } - class HoodieRecordTransformIterator implements ClosableIterator { - private final ClosableIterator dataIterator; - private final HoodieRecord.Mapper mapper; - - public HoodieRecordTransformIterator(ClosableIterator dataIterator, Mapper mapper) { - this.dataIterator = dataIterator; - this.mapper = mapper; - } - - @Override - public boolean hasNext() { - return dataIterator.hasNext(); - } - - @Override - public HoodieRecord next() { - return mapper.apply(dataIterator.next()); - } - - @Override - public void close() { - dataIterator.close(); - } + default Option> getRecordByKey(String key, Schema readerSchema) throws IOException { + return getIndexedRecordByKey(key, readerSchema) + .map(data -> unsafeCast(new HoodieAvroIndexedRecord(data))); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java new file mode 100644 index 0000000000000..a194e2fc6afe9 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.hfile.CacheConfig; + +import java.io.IOException; + +public class HoodieAvroFileReaderFactory extends HoodieFileReaderFactory { + + private static class SingletonHolder { + private static final HoodieAvroFileReaderFactory INSTANCE = new HoodieAvroFileReaderFactory(); + } + + private HoodieAvroFileReaderFactory() { + } + + public static HoodieFileReaderFactory getFileReaderFactory() { + return SingletonHolder.INSTANCE; + } + + protected HoodieFileReader newParquetFileReader(Configuration conf, Path path) { + return new HoodieAvroParquetReader(conf, path); + } + + protected HoodieFileReader newHFileFileReader(Configuration conf, Path path) throws IOException { + CacheConfig cacheConfig = new CacheConfig(conf); + return new HoodieAvroHFileReader(conf, path, cacheConfig); + } + + @Override + protected HoodieFileReader newOrcFileReader(Configuration conf, Path path) { + return new HoodieAvroOrcReader(conf, path); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriter.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriter.java index 947b83fe8600c..bb046c2395f1b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriter.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriter.java @@ -19,6 +19,7 @@ package org.apache.hudi.io.storage; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -41,13 +42,13 @@ public interface HoodieAvroFileWriter extends HoodieFileWriter { @Override default void writeWithMetadata(HoodieKey key, HoodieRecord record, Schema schema, Properties props) throws IOException { - IndexedRecord avroPayload = (IndexedRecord)record.toIndexedRecord(schema, props).get(); + IndexedRecord avroPayload = ((HoodieAvroIndexedRecord) record.toIndexedRecord(schema, props).get()).getData(); writeAvroWithMetadata(key, avroPayload); } @Override default void write(String recordKey, HoodieRecord record, Schema schema, Properties props) throws IOException { - IndexedRecord avroPayload = (IndexedRecord)record.toIndexedRecord(schema, props).get(); + IndexedRecord avroPayload = ((HoodieAvroIndexedRecord) record.toIndexedRecord(schema, props).get()).getData(); writeAvro(recordKey, avroPayload); } diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java new file mode 100644 index 0000000000000..0ab052a877340 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.compress.Compression; +import org.apache.orc.CompressionKind; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +import java.io.IOException; + +import static org.apache.hudi.io.storage.HoodieHFileConfig.CACHE_DATA_IN_L1; +import static org.apache.hudi.io.storage.HoodieHFileConfig.DROP_BEHIND_CACHE_COMPACTION; +import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR; +import static org.apache.hudi.io.storage.HoodieHFileConfig.PREFETCH_ON_OPEN; + +public class HoodieAvroFileWriterFactory extends HoodieFileWriterFactory { + + private static class SingletonHolder { + private static final HoodieAvroFileWriterFactory INSTANCE = new HoodieAvroFileWriterFactory(); + } + + private HoodieAvroFileWriterFactory() { + } + + public static HoodieFileWriterFactory getFileReaderFactory() { + return HoodieAvroFileWriterFactory.SingletonHolder.INSTANCE; + } + + protected HoodieFileWriter newParquetFileWriter( + String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + boolean populateMetaFields = config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS); + boolean enableBloomFilter = populateMetaFields; + Option filter = enableBloomFilter ? 
Option.of(createBloomFilter(config)) : Option.empty(); + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter); + String compressionCodecName = config.getStringOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME); + // Support PARQUET_COMPRESSION_CODEC_NAME is "" + if (compressionCodecName.isEmpty()) { + compressionCodecName = null; + } + HoodieParquetConfig parquetConfig = new HoodieParquetConfig(writeSupport, + CompressionCodecName.fromConf(compressionCodecName), + config.getIntOrDefault(HoodieStorageConfig.PARQUET_BLOCK_SIZE), + config.getIntOrDefault(HoodieStorageConfig.PARQUET_PAGE_SIZE), + config.getLongOrDefault(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE), + conf, config.getDoubleOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION), + config.getBooleanOrDefault(HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED)); + return new HoodieAvroParquetWriter(path, parquetConfig, instantTime, taskContextSupplier, populateMetaFields); + } + + protected HoodieFileWriter newParquetFileWriter( + FSDataOutputStream outputStream, Configuration conf, HoodieConfig config, Schema schema) throws IOException { + boolean enableBloomFilter = false; + Option filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty(); + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter); + HoodieParquetConfig parquetConfig = new HoodieParquetConfig<>(writeSupport, + CompressionCodecName.fromConf(config.getString(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME)), + config.getInt(HoodieStorageConfig.PARQUET_BLOCK_SIZE), + config.getInt(HoodieStorageConfig.PARQUET_PAGE_SIZE), + config.getLong(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE), // todo: 1024*1024*1024 + conf, config.getDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION)); + return new HoodieParquetStreamWriter(outputStream, parquetConfig); + } + + protected HoodieFileWriter newHFileFileWriter( + String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + BloomFilter filter = createBloomFilter(config); + HoodieHFileConfig hfileConfig = new HoodieHFileConfig(conf, + Compression.Algorithm.valueOf(config.getString(HoodieStorageConfig.HFILE_COMPRESSION_ALGORITHM_NAME)), + config.getInt(HoodieStorageConfig.HFILE_BLOCK_SIZE), + config.getLong(HoodieStorageConfig.HFILE_MAX_FILE_SIZE), + HoodieAvroHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, + filter, HFILE_COMPARATOR); + + return new HoodieAvroHFileWriter(instantTime, path, hfileConfig, schema, taskContextSupplier, config.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS)); + } + + protected HoodieFileWriter newOrcFileWriter( + String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + BloomFilter filter = createBloomFilter(config); + HoodieOrcConfig orcConfig = new HoodieOrcConfig(conf, + CompressionKind.valueOf(config.getString(HoodieStorageConfig.ORC_COMPRESSION_CODEC_NAME)), + config.getInt(HoodieStorageConfig.ORC_STRIPE_SIZE), + config.getInt(HoodieStorageConfig.ORC_BLOCK_SIZE), + config.getLong(HoodieStorageConfig.ORC_FILE_MAX_SIZE), filter); + return new HoodieAvroOrcWriter(instantTime, path, orcConfig, schema, taskContextSupplier); + } +} diff --git 
a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroHFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroHFileReader.java index 6ac2a94932267..90323d88831ba 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroHFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroHFileReader.java @@ -171,21 +171,26 @@ public Set filterRowKeys(Set candidateRowKeys) { @SuppressWarnings("unchecked") @Override - public Option getRecordByKey(String key, Schema readerSchema) throws IOException { + public Option getIndexedRecordByKey(String key, Schema readerSchema) throws IOException { synchronized (sharedScannerLock) { return fetchRecordByKeyInternal(sharedScanner, key, getSchema(), readerSchema); } } - public ClosableIterator getRecordIterator(Schema readerSchema) throws IOException { + public ClosableIterator getIndexedRecordIterator(Schema readerSchema) throws IOException { // TODO eval whether seeking scanner would be faster than pread HFileScanner scanner = getHFileScanner(reader, false); return new RecordIterator(scanner, getSchema(), readerSchema); } + @Override + public ClosableIterator getIndexedRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException { + throw new UnsupportedOperationException(); + } + @SuppressWarnings("unchecked") @Override - public ClosableIterator getRecordsByKeysIterator(List keys, Schema readerSchema) throws IOException { + public ClosableIterator getIndexedRecordsByKeysIterator(List keys, Schema readerSchema) throws IOException { // We're caching blocks for this scanner to minimize amount of traffic // to the underlying storage as we fetched (potentially) sparsely distributed // keys @@ -195,7 +200,7 @@ public ClosableIterator getRecordsByKeysIterator(List key @SuppressWarnings("unchecked") @Override - public ClosableIterator getRecordsByKeyPrefixIterator(List keyPrefixes, Schema readerSchema) throws IOException { + public ClosableIterator getIndexedRecordsByKeyPrefixIterator(List keyPrefixes, Schema readerSchema) throws IOException { // We're caching blocks for this scanner to minimize amount of traffic // to the underlying storage as we fetched (potentially) sparsely distributed // keys @@ -372,7 +377,7 @@ private static byte[] copyValueFromCell(Cell c) { */ public static List readAllRecords(HoodieAvroHFileReader reader) throws IOException { Schema schema = reader.getSchema(); - return toStream(reader.getRecordIterator(schema)) + return toStream(reader.getIndexedRecordIterator(schema)) .collect(Collectors.toList()); } @@ -395,7 +400,7 @@ public static List readRecords(HoodieAvroHFileReader reader, List keys, Schema schema) throws IOException { Collections.sort(keys); - return toStream(reader.getRecordsByKeysIterator(keys, schema)) + return toStream(reader.getIndexedRecordsByKeysIterator(keys, schema)) .collect(Collectors.toList()); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroHFileWriter.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroHFileWriter.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroHFileWriter.java rename to hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroHFileWriter.java diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroOrcReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroOrcReader.java index 8f953d25b5ab6..051add110282e 100644 --- 
a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroOrcReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroOrcReader.java @@ -65,7 +65,7 @@ public Set filterRowKeys(Set candidateRowKeys) { } @Override - public ClosableIterator getRecordIterator(Schema schema) throws IOException { + public ClosableIterator getIndexedRecordIterator(Schema schema) throws IOException { try { Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(schema); @@ -76,6 +76,11 @@ public ClosableIterator getRecordIterator(Schema schema) throws I } } + @Override + public ClosableIterator getIndexedRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException { + throw new UnsupportedOperationException(); + } + @Override public Schema getSchema() { return orcUtils.readAvroSchema(conf, path); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroOrcWriter.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroOrcWriter.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroOrcWriter.java rename to hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroOrcWriter.java diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java index 83ffaf589f8ae..769ef391b4630 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java @@ -26,6 +26,7 @@ import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.util.BaseFileUtils; import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetReaderIterator; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroReadSupport; @@ -65,12 +66,13 @@ public Set filterRowKeys(Set candidateRowKeys) { } @Override - public ClosableIterator getRecordIterator(Schema schema) throws IOException { - AvroReadSupport.setAvroReadSchema(conf, schema); - ParquetReader reader = AvroParquetReader.builder(path).withConf(conf).build(); - ParquetReaderIterator parquetReaderIterator = new ParquetReaderIterator<>(reader); - readerIterators.add(parquetReaderIterator); - return parquetReaderIterator; + public ClosableIterator getIndexedRecordIterator(Schema schema) throws IOException { + return getIndexedRecordIteratorInternal(schema, Option.empty()); + } + + @Override + public ClosableIterator getIndexedRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException { + return getIndexedRecordIteratorInternal(readerSchema, Option.of(requestedSchema)); } @Override @@ -87,4 +89,15 @@ public void close() { public long getTotalRecords() { return parquetUtils.getRowCount(conf, path); } + + private ClosableIterator getIndexedRecordIteratorInternal(Schema schema, Option requestedSchema) throws IOException { + AvroReadSupport.setAvroReadSchema(conf, schema); + if (requestedSchema.isPresent()) { + AvroReadSupport.setRequestedProjection(conf, requestedSchema.get()); + } + ParquetReader reader = AvroParquetReader.builder(path).withConf(conf).build(); + ParquetReaderIterator parquetReaderIterator = new ParquetReaderIterator<>(reader); + readerIterators.add(parquetReaderIterator); + return 
parquetReaderIterator; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetWriter.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetWriter.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetWriter.java rename to hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetWriter.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java rename to hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java index 175722bb9be85..8152a176a074b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java @@ -29,7 +29,7 @@ import java.util.List; import java.util.Set; -public interface HoodieFileReader extends AutoCloseable { +public interface HoodieFileReader extends AutoCloseable { String[] readMinMaxRecordKeys(); @@ -37,34 +37,36 @@ public interface HoodieFileReader extends AutoCloseable { Set filterRowKeys(Set candidateRowKeys); - ClosableIterator getRecordIterator(Schema readerSchema, HoodieRecord.Mapper mapper) throws IOException; + ClosableIterator> getRecordIterator(Schema readerSchema) throws IOException; - default ClosableIterator getRecordIterator(HoodieRecord.Mapper mapper) throws IOException { - return getRecordIterator(getSchema(), mapper); + ClosableIterator> getRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException; + + default ClosableIterator> getRecordIterator() throws IOException { + return getRecordIterator(getSchema()); } - default Option getRecordByKey(String key, Schema readerSchema, HoodieRecord.Mapper mapper) throws IOException { + default Option> getRecordByKey(String key, Schema readerSchema) throws IOException { throw new UnsupportedOperationException(); } - default Option getRecordByKey(String key, HoodieRecord.Mapper mapper) throws IOException { - return getRecordByKey(key, getSchema(), mapper); + default Option> getRecordByKey(String key) throws IOException { + return getRecordByKey(key, getSchema()); } - default ClosableIterator getRecordsByKeysIterator(List keys, Schema schema, HoodieRecord.Mapper mapper) throws IOException { + default ClosableIterator> getRecordsByKeysIterator(List keys, Schema schema) throws IOException { throw new UnsupportedOperationException(); } - default ClosableIterator getRecordsByKeysIterator(List keys, HoodieRecord.Mapper mapper) throws IOException { - return getRecordsByKeysIterator(keys, getSchema(), mapper); + default ClosableIterator> getRecordsByKeysIterator(List keys) throws IOException { + return getRecordsByKeysIterator(keys, getSchema()); } - default ClosableIterator getRecordsByKeyPrefixIterator(List keyPrefixes, Schema schema, HoodieRecord.Mapper mapper) throws IOException { + default ClosableIterator> getRecordsByKeyPrefixIterator(List keyPrefixes, Schema schema) throws IOException { throw new UnsupportedEncodingException(); } - default ClosableIterator 
getRecordsByKeyPrefixIterator(List keyPrefixes, HoodieRecord.Mapper mapper) throws IOException { - return getRecordsByKeyPrefixIterator(keyPrefixes, getSchema(), mapper); + default ClosableIterator> getRecordsByKeyPrefixIterator(List keyPrefixes) throws IOException { + return getRecordsByKeyPrefixIterator(keyPrefixes, getSchema()); } Schema getSchema(); diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java index d2e35eb450a3e..f84ec530add91 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java @@ -20,10 +20,16 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.io.hfile.CacheConfig; + import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.exception.HoodieException; import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import static org.apache.hudi.common.model.HoodieFileFormat.HFILE; import static org.apache.hudi.common.model.HoodieFileFormat.ORC; @@ -31,7 +37,24 @@ public class HoodieFileReaderFactory { - public static HoodieAvroFileReader getFileReader(Configuration conf, Path path) throws IOException { + public static HoodieFileReaderFactory getReaderFactory(HoodieRecord.HoodieRecordType recordType) { + switch (recordType) { + case AVRO: + return HoodieAvroFileReaderFactory.getFileReaderFactory(); + case SPARK: + try { + Class clazz = ReflectionUtils.getClass("org.apache.hudi.io.storage.HoodieSparkFileReaderFactory"); + Method method = clazz.getMethod("getFileReaderFactory", null); + return (HoodieFileReaderFactory) method.invoke(null,null); + } catch (NoSuchMethodException | IllegalArgumentException | InvocationTargetException | IllegalAccessException e) { + throw new HoodieException("Unable to create hoodie spark file writer factory", e); + } + default: + throw new UnsupportedOperationException(recordType + " record type not supported yet."); + } + } + + public HoodieFileReader getFileReader(Configuration conf, Path path) throws IOException { final String extension = FSUtils.getFileExtension(path.toString()); if (PARQUET.getFileExtension().equals(extension)) { return newParquetFileReader(conf, path); @@ -46,16 +69,19 @@ public static HoodieAvroFileReader getFileReader(Configuration conf, Path path) throw new UnsupportedOperationException(extension + " format not supported yet."); } - private static HoodieAvroFileReader newParquetFileReader(Configuration conf, Path path) { - return new HoodieAvroParquetReader(conf, path); + public HoodieFileReader getFileReader(Configuration conf, Path path, HoodieFileFormat format) throws IOException { + return this.newParquetFileReader(conf, path); + } + + protected HoodieFileReader newParquetFileReader(Configuration conf, Path path) { + throw new UnsupportedOperationException(); } - private static HoodieAvroFileReader newHFileFileReader(Configuration conf, Path path) throws IOException { - CacheConfig cacheConfig = new CacheConfig(conf); - return new HoodieAvroHFileReader(conf, path, cacheConfig); + protected HoodieFileReader newHFileFileReader(Configuration conf, Path path) throws IOException { + throw new 
UnsupportedOperationException(); } - private static HoodieAvroFileReader newOrcFileReader(Configuration conf, Path path) { - return new HoodieAvroOrcReader(conf, path); + protected HoodieFileReader newOrcFileReader(Configuration conf, Path path) { + throw new UnsupportedOperationException(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java new file mode 100644 index 0000000000000..99f35d7a0f051 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.exception.HoodieException; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; + +import static org.apache.hudi.common.model.HoodieFileFormat.HFILE; +import static org.apache.hudi.common.model.HoodieFileFormat.ORC; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; + +public class HoodieFileWriterFactory { + + private static HoodieFileWriterFactory getWriterFactory(HoodieRecord.HoodieRecordType recordType) { + switch (recordType) { + case AVRO: + return HoodieAvroFileWriterFactory.getFileReaderFactory(); + case SPARK: + try { + Class clazz = ReflectionUtils.getClass("org.apache.hudi.io.storage.HoodieSparkFileWriterFactory"); + Method method = clazz.getMethod("getFileWriterFactory", null); + return (HoodieFileWriterFactory) method.invoke(null, null); + } catch (NoSuchMethodException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) { + throw new HoodieException("Unable to create hoodie spark file writer factory", e); + } + default: + throw new UnsupportedOperationException(recordType + " record type not supported yet."); + } + } + + public static HoodieFileWriter getFileWriter( + String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + 
TaskContextSupplier taskContextSupplier, HoodieRecordType recordType) throws IOException { + final String extension = FSUtils.getFileExtension(path.getName()); + HoodieFileWriterFactory factory = getWriterFactory(recordType); + return factory.getFileWriterByFormat(extension, instantTime, path, conf, config, schema, taskContextSupplier); + } + + public static HoodieFileWriter getFileWriter(HoodieFileFormat format, + FSDataOutputStream outputStream, Configuration conf, HoodieConfig config, Schema schema, HoodieRecordType recordType) throws IOException { + HoodieFileWriterFactory factory = getWriterFactory(recordType); + return factory.getFileWriterByFormat(format, outputStream, conf, config, schema); + } + + protected HoodieFileWriter getFileWriterByFormat( + String extension, String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + if (PARQUET.getFileExtension().equals(extension)) { + return newParquetFileWriter(instantTime, path, conf, config, schema, taskContextSupplier); + } + if (HFILE.getFileExtension().equals(extension)) { + return newHFileFileWriter(instantTime, path, conf, config, schema, taskContextSupplier); + } + if (ORC.getFileExtension().equals(extension)) { + return newOrcFileWriter(instantTime, path, conf, config, schema, taskContextSupplier); + } + throw new UnsupportedOperationException(extension + " format not supported yet."); + } + + protected HoodieFileWriter getFileWriterByFormat(HoodieFileFormat format, + FSDataOutputStream outputStream, Configuration conf, HoodieConfig config, Schema schema) throws IOException { + switch (format) { + case PARQUET: + return newParquetFileWriter(outputStream, conf, config, schema); + default: + throw new UnsupportedOperationException(format + " format not supported yet."); + } + } + + protected HoodieFileWriter newParquetFileWriter( + String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + throw new UnsupportedOperationException(); + } + + protected HoodieFileWriter newParquetFileWriter( + FSDataOutputStream outputStream, Configuration conf, HoodieConfig config, Schema schema) throws IOException { + throw new UnsupportedOperationException(); + } + + protected HoodieFileWriter newHFileFileWriter( + String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + throw new UnsupportedOperationException(); + } + + protected HoodieFileWriter newOrcFileWriter( + String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + throw new UnsupportedOperationException(); + } + + protected BloomFilter createBloomFilter(HoodieConfig config) { + return BloomFilterFactory.createBloomFilter(60000, 0.000000001, 100000, + BloomFilterTypeCode.DYNAMIC_V0.name()); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java rename to hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcConfig.java 
b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcConfig.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcConfig.java rename to hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcConfig.java diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java index 8b5bad3233a2e..f777c55e89e0f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java @@ -32,9 +32,9 @@ import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -50,7 +50,7 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.exception.TableNotFoundException; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.avro.Schema; @@ -97,7 +97,7 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { private final boolean reuse; // Readers for the latest file slice corresponding to file groups in the metadata partition - private Map, Pair> partitionReaders = + private Map, Pair> partitionReaders = new ConcurrentHashMap<>(); public HoodieBackedTableMetadata(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, @@ -164,12 +164,12 @@ public HoodieData> getRecordsByKeyPrefixes(L (SerializableFunction>>) fileSlice -> { // NOTE: Since this will be executed by executors, we can't access previously cached // readers, and therefore have to always open new ones - Pair readers = + Pair readers = openReaders(partitionName, fileSlice); try { List timings = new ArrayList<>(); - HoodieAvroFileReader baseFileReader = readers.getKey(); + HoodieFileReader baseFileReader = readers.getKey(); HoodieMetadataMergedLogRecordReader logRecordScanner = readers.getRight(); if (baseFileReader == null && logRecordScanner == null) { @@ -210,11 +210,11 @@ public List>>> getRecord List>>> result = new ArrayList<>(); AtomicInteger fileSlicesKeysCount = new AtomicInteger(); partitionFileSliceToKeysMap.forEach((partitionFileSlicePair, fileSliceKeys) -> { - Pair readers = + Pair readers = getOrCreateReaders(partitionName, partitionFileSlicePair.getRight()); try { List timings = new ArrayList<>(); - HoodieAvroFileReader baseFileReader = readers.getKey(); + HoodieFileReader baseFileReader = readers.getKey(); HoodieMetadataMergedLogRecordReader logRecordScanner = readers.getRight(); if (baseFileReader == null && logRecordScanner == null) { return; @@ -281,7 +281,7 @@ private Map>> readLogRecords( return logRecords; } - private List>>> readFromBaseAndMergeWithLogRecords(HoodieAvroFileReader baseFileReader, + private List>>> readFromBaseAndMergeWithLogRecords(HoodieFileReader baseFileReader, List keys, 
boolean fullKeys, Map>> logRecords, @@ -342,12 +342,12 @@ private List>>> readFrom } } - private Map> fetchBaseFileRecordsByKeys(HoodieAvroFileReader baseFileReader, + private Map> fetchBaseFileRecordsByKeys(HoodieFileReader baseFileReader, List keys, boolean fullKeys, String partitionName) throws IOException { - ClosableIterator records = fullKeys ? baseFileReader.getRecordsByKeysIterator(keys, HoodieAvroIndexedRecord::new) - : baseFileReader.getRecordsByKeyPrefixIterator(keys, HoodieAvroIndexedRecord::new); + ClosableIterator records = fullKeys ? baseFileReader.getRecordsByKeysIterator(keys) + : baseFileReader.getRecordsByKeyPrefixIterator(keys); return toStream(records) .map(record -> { @@ -402,7 +402,7 @@ private Map, List> getPartitionFileSliceToKeysMa * @param slice - The file slice to open readers for * @return File reader and the record scanner pair for the requested file slice */ - private Pair getOrCreateReaders(String partitionName, FileSlice slice) { + private Pair getOrCreateReaders(String partitionName, FileSlice slice) { if (reuse) { return partitionReaders.computeIfAbsent(Pair.of(partitionName, slice.getFileId()), k -> { return openReaders(partitionName, slice); }); @@ -411,12 +411,12 @@ private Pair getOrCre } } - private Pair openReaders(String partitionName, FileSlice slice) { + private Pair openReaders(String partitionName, FileSlice slice) { try { HoodieTimer timer = new HoodieTimer().startTimer(); // Open base file reader - Pair baseFileReaderOpenTimePair = getBaseFileReader(slice, timer); - HoodieAvroFileReader baseFileReader = baseFileReaderOpenTimePair.getKey(); + Pair baseFileReaderOpenTimePair = getBaseFileReader(slice, timer); + HoodieFileReader baseFileReader = baseFileReaderOpenTimePair.getKey(); final long baseFileOpenMs = baseFileReaderOpenTimePair.getValue(); // Open the log record scanner using the log files from the latest file slice @@ -434,14 +434,14 @@ private Pair openRead } } - private Pair getBaseFileReader(FileSlice slice, HoodieTimer timer) throws IOException { - HoodieAvroFileReader baseFileReader = null; + private Pair getBaseFileReader(FileSlice slice, HoodieTimer timer) throws IOException { + HoodieFileReader baseFileReader = null; Long baseFileOpenMs; // If the base file is present then create a reader Option basefile = slice.getBaseFile(); if (basefile.isPresent()) { String basefilePath = basefile.get().getPath(); - baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath)); + baseFileReader = HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(hadoopConf.get(), new Path(basefilePath)); baseFileOpenMs = timer.endTimer(); LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath, basefile.get().getCommitTime(), baseFileOpenMs)); @@ -572,7 +572,7 @@ public void close() { * @param partitionFileSlicePair - Partition and FileSlice */ private synchronized void close(Pair partitionFileSlicePair) { - Pair readers = + Pair readers = partitionReaders.remove(partitionFileSlicePair); closeReader(readers); } @@ -587,7 +587,7 @@ private void closePartitionReaders() { partitionReaders.clear(); } - private void closeReader(Pair readers) { + private void closeReader(Pair readers) { if (readers != null) { try { if (readers.getKey() != null) { diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java index 
cbd7e6c17511c..1d21ceabdca65 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java @@ -19,17 +19,14 @@ package org.apache.hudi.metadata; import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileSystem; import org.apache.hudi.common.config.HoodieMetadataConfig; -import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.log.InstantRange; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.internal.schema.InternalSchema; @@ -61,24 +58,8 @@ private HoodieMetadataMergedLogRecordReader(FileSystem fs, String basePath, Stri boolean isBitCaskDiskMapCompressionEnabled, Option instantRange, boolean allowFullScan) { super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, true, false, bufferSize, - spillableMapBasePath, instantRange, diskMapType, isBitCaskDiskMapCompressionEnabled, false, allowFullScan, Option.of(partitionName), InternalSchema.getEmptyInternalSchema()); - } - - @Override - protected HoodieAvroRecord createHoodieRecord(final IndexedRecord rec, final HoodieTableConfig hoodieTableConfig, - final String payloadClassFQN, final String preCombineField, - final boolean withOperationField, - final Option> simpleKeyGenFields, - final Option partitionName) { - if (hoodieTableConfig.populateMetaFields()) { - return super.createHoodieRecord(rec, hoodieTableConfig, payloadClassFQN, preCombineField, withOperationField, - simpleKeyGenFields, partitionName); - } - - // When meta fields are not available, create the record using the - // preset key field and the known partition name - return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) rec, payloadClassFQN, - preCombineField, simpleKeyGenFields.get(), withOperationField, partitionName); + spillableMapBasePath, instantRange, diskMapType, isBitCaskDiskMapCompressionEnabled, false, allowFullScan, Option.of(partitionName), + InternalSchema.getEmptyInternalSchema(), HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())); } /** @@ -128,6 +109,11 @@ public synchronized List } } + @Override + protected boolean getPopulateMetaFields() { + return this.hoodieTableMetaClient.getTableConfig().populateMetaFields() && super.getPopulateMetaFields(); + } + @Override protected String getKeyField() { return HoodieMetadataPayload.KEY_FIELD_NAME; diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java index 349c0efb482a5..cb78ad8dd1826 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java @@ -92,6 +92,9 @@ static String getDatasetBasePath(String metadataTableBasePath) { * @param basePath The base path to check */ static boolean 
isMetadataTable(String basePath) { + if (basePath == null || basePath.isEmpty()) { + return false; + } if (basePath.endsWith(Path.SEPARATOR)) { basePath = basePath.substring(0, basePath.length() - 1); } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 36045723683fc..d932661bb0c4d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -34,6 +34,7 @@ import org.apache.hudi.common.model.HoodieDeltaWriteStat; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.WriteOperationType; @@ -53,7 +54,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieMetadataException; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.util.Lazy; @@ -413,8 +414,8 @@ public static HoodieData convertMetadataToBloomFilterRecords( } final Path writeFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition); - try (HoodieAvroFileReader fileReader = - HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), writeFilePath)) { + try (HoodieFileReader fileReader = + HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), writeFilePath)) { try { final BloomFilter fileBloomFilter = fileReader.readBloomFilter(); if (fileBloomFilter == null) { @@ -873,8 +874,8 @@ public static HoodieData convertFilesToBloomFilterRecords(HoodieEn } final String pathWithPartition = partitionName + "/" + appendedFile; final Path appendedFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition); - try (HoodieAvroFileReader fileReader = - HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), appendedFilePath)) { + try (HoodieFileReader fileReader = + HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), appendedFilePath)) { final BloomFilter fileBloomFilter = fileReader.readBloomFilter(); if (fileBloomFilter == null) { LOG.error("Failed to read bloom filter for " + appendedFilePath); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java index 289180580b11a..596a109108654 100755 --- a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java @@ -24,8 +24,10 @@ import org.apache.hudi.common.model.HoodieArchivedLogFile; import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieAvroIndexedRecord; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import 
org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.log.AppendResult; @@ -50,6 +52,7 @@ import org.apache.hudi.common.testutils.SchemaTestUtil; import org.apache.hudi.common.testutils.minicluster.MiniClusterUtil; import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.exception.CorruptedLogFileException; @@ -591,6 +594,7 @@ public void testBasicAppendAndScanMultipleFiles(ExternalSpillableMap.DiskMapType .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); List scannedRecords = new ArrayList<>(); @@ -805,6 +809,7 @@ public void testAvroLogRecordReaderBasic(ExternalSpillableMap.DiskMapType diskMa .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(200, scanner.getTotalLogRecords()); Set readKeys = new HashSet<>(200); @@ -883,6 +888,7 @@ public void testAvroLogRecordReaderWithRollbackTombstone(ExternalSpillableMap.Di .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We read 200 records from 2 write batches"); Set readKeys = new HashSet<>(200); @@ -970,6 +976,7 @@ public void testAvroLogRecordReaderWithFailedPartialBlock(ExternalSpillableMap.D .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We would read 200 records"); Set readKeys = new HashSet<>(200); @@ -1048,6 +1055,7 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(ExternalSpillableMap.Di .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We still would read 200 records"); @@ -1094,6 +1102,7 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(ExternalSpillableMap.Di .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals(200, readKeys.size(), "Stream collect should return all 200 records after rollback of delete"); @@ -1189,6 +1198,7 @@ public void testAvroLogRecordReaderWithDisorderDelete(ExternalSpillableMap.DiskM 
.withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We still would read 200 records"); @@ -1292,6 +1302,7 @@ public void testAvroLogRecordReaderWithFailedRollbacks(ExternalSpillableMap.Disk .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would have scanned 0 records because of rollback"); @@ -1360,6 +1371,7 @@ public void testAvroLogRecordReaderWithInsertDeleteAndRollback(ExternalSpillable .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); FileCreateUtils.deleteDeltaCommit(basePath, "100", fs); @@ -1411,6 +1423,7 @@ public void testAvroLogRecordReaderWithInvalidRollback(ExternalSpillableMap.Disk .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(100, scanner.getTotalLogRecords(), "We still would read 100 records"); final List readKeys = new ArrayList<>(100); @@ -1481,6 +1494,7 @@ public void testAvroLogRecordReaderWithInsertsDeleteAndRollback(ExternalSpillabl .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); } @@ -1587,6 +1601,7 @@ public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(ExternalS .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); FileCreateUtils.deleteDeltaCommit(basePath, "100", fs); @@ -1661,6 +1676,7 @@ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1 .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); assertEquals(Math.max(numRecordsInLog1, numRecordsInLog2), scanner.getNumMergedRecordsInLog(), @@ -2040,10 +2056,10 @@ private static Stream testArguments() { * Utility to convert the given iterator to a List. 
*/ private static List getRecords(HoodieDataBlock dataBlock) { - ClosableIterator itr = dataBlock.getRecordIterator(HoodieAvroIndexedRecord::new); + ClosableIterator> itr = dataBlock.getRecordIterator(HoodieRecordType.AVRO); List elements = new ArrayList<>(); - itr.forEachRemaining(r -> elements.add((IndexedRecord) r.getData())); + itr.forEachRemaining(r -> elements.add(r.getData())); return elements; } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java index 0defefe2ea4e4..69b183fa09612 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java @@ -63,7 +63,7 @@ public void setUp() throws Exception { @Test public void testCreate() throws IOException { assertTrue(fs.exists(new Path(metaPath, HoodieTableConfig.HOODIE_PROPERTIES_FILE))); - HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null, null); assertEquals(6, config.getProps().size()); } @@ -76,7 +76,7 @@ public void testUpdate() throws IOException { assertTrue(fs.exists(cfgPath)); assertFalse(fs.exists(backupCfgPath)); - HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null, null); assertEquals(7, config.getProps().size()); assertEquals("test-table2", config.getTableName()); assertEquals("new_field", config.getPreCombineField()); @@ -89,7 +89,7 @@ public void testDelete() throws IOException { assertTrue(fs.exists(cfgPath)); assertFalse(fs.exists(backupCfgPath)); - HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null, null); assertEquals(5, config.getProps().size()); assertNull(config.getProps().getProperty("hoodie.invalid.config")); assertFalse(config.getProps().contains(HoodieTableConfig.ARCHIVELOG_FOLDER.key())); @@ -99,13 +99,13 @@ public void testDelete() throws IOException { public void testReadsWhenPropsFileDoesNotExist() throws IOException { fs.delete(cfgPath, false); assertThrows(HoodieIOException.class, () -> { - new HoodieTableConfig(fs, metaPath.toString(), null); + new HoodieTableConfig(fs, metaPath.toString(), null, null); }); } @Test public void testReadsWithUpdateFailures() throws IOException { - HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null, null); fs.delete(cfgPath, false); try (FSDataOutputStream out = fs.create(backupCfgPath)) { config.getProps().store(out, ""); @@ -113,14 +113,14 @@ public void testReadsWithUpdateFailures() throws IOException { assertFalse(fs.exists(cfgPath)); assertTrue(fs.exists(backupCfgPath)); - config = new HoodieTableConfig(fs, metaPath.toString(), null); + config = new HoodieTableConfig(fs, metaPath.toString(), null, null); assertEquals(6, config.getProps().size()); } @ParameterizedTest @ValueSource(booleans = {true, false}) public void testUpdateRecovery(boolean shouldPropsFileExist) throws IOException { - HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null, null); if (!shouldPropsFileExist) { 
fs.delete(cfgPath, false); } @@ -131,7 +131,7 @@ public void testUpdateRecovery(boolean shouldPropsFileExist) throws IOException HoodieTableConfig.recoverIfNeeded(fs, cfgPath, backupCfgPath); assertTrue(fs.exists(cfgPath)); assertFalse(fs.exists(backupCfgPath)); - config = new HoodieTableConfig(fs, metaPath.toString(), null); + config = new HoodieTableConfig(fs, metaPath.toString(), null, null); assertEquals(6, config.getProps().size()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/HoodieRecordUtilsTest.java b/hudi-common/src/test/java/org/apache/hudi/common/util/HoodieRecordUtilsTest.java index 0c51571c9e1b8..4dcabecccb7b9 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/HoodieRecordUtilsTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/HoodieRecordUtilsTest.java @@ -20,8 +20,8 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.model.DefaultHoodieRecordPayload; -import org.apache.hudi.common.model.HoodieAvroRecordMerge; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.HoodieRecordPayload; import org.junit.jupiter.api.Test; @@ -31,11 +31,11 @@ class HoodieRecordUtilsTest { @Test void loadHoodieMerge() { - String mergeClassName = HoodieAvroRecordMerge.class.getName(); - HoodieMerge merge1 = HoodieRecordUtils.loadMerge(mergeClassName); - HoodieMerge merge2 = HoodieRecordUtils.loadMerge(mergeClassName); - assertEquals(merge1.getClass().getName(), mergeClassName); - assertEquals(merge1, merge2); + String mergeClassName = HoodieAvroRecordMerger.class.getName(); + HoodieRecordMerger recordMerger1 = HoodieRecordUtils.loadRecordMerger(mergeClassName); + HoodieRecordMerger recordMerger2 = HoodieRecordUtils.loadRecordMerger(mergeClassName); + assertEquals(recordMerger1.getClass().getName(), mergeClassName); + assertEquals(recordMerger1, recordMerger2); } @Test diff --git a/hudi-common/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileReaderFactory.java b/hudi-common/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileReaderFactory.java index f049033688efb..c306bab384b07 100644 --- a/hudi-common/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileReaderFactory.java +++ b/hudi-common/src/test/java/org/apache/hudi/io/storage/TestHoodieAvroFileReaderFactory.java @@ -18,6 +18,8 @@ package org.apache.hudi.io.storage; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.Test; @@ -40,19 +42,19 @@ public void testGetFileReader() throws IOException { // parquet file format. final Configuration hadoopConf = new Configuration(); final Path parquetPath = new Path("/partition/path/f1_1-0-1_000.parquet"); - HoodieAvroFileReader parquetReader = HoodieFileReaderFactory.getFileReader(hadoopConf, parquetPath); + HoodieFileReader parquetReader = HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(hadoopConf, parquetPath); assertTrue(parquetReader instanceof HoodieAvroParquetReader); // log file format. 
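For context on these call-site changes (here and in HoodieBackedTableMetadata / HoodieTableMetadataUtil above), a minimal sketch of the new reader lookup, assuming only the signatures introduced in this diff; the helper name and arguments are placeholders, not part of the patch. The new HoodieFileWriterFactory mirrors the same pattern on the write path through getFileWriter(..., recordType).

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
    import org.apache.hudi.io.storage.HoodieFileReader;
    import org.apache.hudi.io.storage.HoodieFileReaderFactory;

    // Hypothetical helper: resolve the AVRO factory, then open the base file reader.
    // A SPARK record type is resolved reflectively via
    // org.apache.hudi.io.storage.HoodieSparkFileReaderFactory, as in the factory hunk above.
    static HoodieFileReader openBaseFileReader(Configuration conf, Path baseFilePath) throws IOException {
      return HoodieFileReaderFactory
          .getReaderFactory(HoodieRecordType.AVRO)
          .getFileReader(conf, baseFilePath);
    }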
final Path logPath = new Path("/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1"); final Throwable thrown = assertThrows(UnsupportedOperationException.class, () -> { - HoodieAvroFileReader logWriter = HoodieFileReaderFactory.getFileReader(hadoopConf, logPath); + HoodieFileReader logWriter = HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(hadoopConf, logPath); }, "should fail since log storage reader is not supported yet."); assertTrue(thrown.getMessage().contains("format not supported yet.")); // Orc file format. final Path orcPath = new Path("/partition/path/f1_1-0-1_000.orc"); - HoodieAvroFileReader orcReader = HoodieFileReaderFactory.getFileReader(hadoopConf, orcPath); + HoodieFileReader orcReader = HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(hadoopConf, orcPath); assertTrue(orcReader instanceof HoodieAvroOrcReader); } } diff --git a/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java b/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java index 7fc93c776f5a8..ab92990718685 100644 --- a/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java +++ b/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java @@ -38,7 +38,9 @@ import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations; import org.apache.parquet.Strings; import org.apache.parquet.avro.AvroParquetReader; @@ -360,6 +362,7 @@ private static HoodieMergedLogRecordScanner getScanner( .withSpillableMapBasePath("/tmp/") .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java index 9957e0c98048e..b969074f60963 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java @@ -23,10 +23,11 @@ import org.apache.hudi.common.config.ConfigGroups; import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.model.EventTimeAvroPayload; -import org.apache.hudi.common.model.HoodieAvroRecordMerge; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; @@ -116,6 +117,20 @@ private FlinkOptions() { .withDescription("Payload class used. 
Override this, if you like to roll your own merge logic, when upserting/inserting.\n" + "This will render any value set for the option in-effective"); + public static final ConfigOption RECORD_MERGER_IMPLS = ConfigOptions + .key("record.merger.impls") + .stringType() + .defaultValue(HoodieAvroRecordMerger.class.getName()) + .withDescription("List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. " + + "These merger impls will filter by record.merger.strategy. " + + "Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)"); + + public static final ConfigOption RECORD_MERGER_STRATEGY = ConfigOptions + .key("record.merger.strategy") + .stringType() + .defaultValue(StringUtils.DEFAULT_MERGER_STRATEGY_UUID) + .withDescription("Id of merger strategy. Hudi will pick RecordMergers in record.merger.impls which has the same merger strategy id"); + public static final ConfigOption PARTITION_DEFAULT_NAME = ConfigOptions .key("partition.default_name") .stringType() @@ -301,13 +316,6 @@ private FlinkOptions() { .defaultValue(WriteOperationType.UPSERT.value()) .withDescription("The write operation, that this write should do"); - public static final ConfigOption MERGE_CLASS_NAME = ConfigOptions - .key("write.merge.class") - .stringType() - .defaultValue(HoodieAvroRecordMerge.class.getName()) - .withDescription("Merge class provide stateless component interface for merging records, and support various HoodieRecord " - + "types, such as Spark records or Flink records."); - /** * Flag to indicate whether to drop duplicates before insert/upsert. * By default false to gain extra performance. diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java index b6b65bb8f1d76..a9f88cd028589 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java @@ -23,11 +23,10 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.ObjectSizeCalculator; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.configuration.FlinkOptions; @@ -52,6 +51,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.Random; import java.util.function.BiFunction; import java.util.stream.Collectors; @@ -104,7 +104,7 @@ public class StreamWriteFunction extends AbstractStreamWriteFunction { private transient BiFunction, String, List> writeFunction; - private transient HoodieMerge merge; + private transient HoodieRecordMerger recordMerger; /** * Total size tracer. 
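The write.merge.class option is removed in favor of record.merger.impls and record.merger.strategy (FlinkOptions hunk above), and StreamWriteFunction now takes the merger from the write client's config instead of loading a merge class by name. A sketch of setting the new options from a Flink job, assuming only the option keys and defaults shown in this diff; the helper method is illustrative:

    import org.apache.flink.configuration.Configuration;
    import org.apache.hudi.common.model.HoodieAvroRecordMerger;
    import org.apache.hudi.common.util.StringUtils;
    import org.apache.hudi.configuration.FlinkOptions;

    // Hypothetical job configuration: a comma-separated list of merger implementations
    // plus a strategy id; Hudi keeps only the impls whose strategy id matches.
    static Configuration mergerOptions() {
      Configuration conf = new Configuration();
      conf.setString(FlinkOptions.RECORD_MERGER_IMPLS, HoodieAvroRecordMerger.class.getName());
      conf.setString(FlinkOptions.RECORD_MERGER_STRATEGY, StringUtils.DEFAULT_MERGER_STRATEGY_UUID);
      return conf;
    }

StreamerUtil then forwards these values to the write config through withMergerImpls(...) and withMergerStrategy(...), as shown further down in this diff.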
@@ -201,9 +201,8 @@ private void initWriteFunction() { } private void initMergeClass() { - String mergeClassName = metaClient.getTableConfig().getMergeClass(); - LOG.info("init hoodie merge with class [{}]", mergeClassName); - merge = HoodieRecordUtils.loadMerge(mergeClassName); + recordMerger = writeClient.getConfig().getRecordMerger(); + LOG.info("init hoodie merge with class [{}]", recordMerger.getClass().getName()); } /** @@ -432,7 +431,9 @@ private boolean flushBucket(DataBucket bucket) { List records = bucket.writeBuffer(); ValidationUtils.checkState(records.size() > 0, "Data bucket to flush has no buffering records"); if (config.getBoolean(FlinkOptions.PRE_COMBINE)) { - records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1, merge); + Properties props = new Properties(); + config.addAllToProperties(props); + records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1, recordMerger, props); } bucket.preWrite(records); final List writeStatus = new ArrayList<>(writeFunction.apply(records, instant)); @@ -467,7 +468,9 @@ private void flushRemaining(boolean endInput) { List records = bucket.writeBuffer(); if (records.size() > 0) { if (config.getBoolean(FlinkOptions.PRE_COMBINE)) { - records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1, merge); + Properties props = new Properties(); + config.addAllToProperties(props); + records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1, recordMerger, props); } bucket.preWrite(records); writeStatus.addAll(writeFunction.apply(records, currentInstant)); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringOperator.java index 5fd2510fd50a4..da4504c8e6693 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringOperator.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringOperator.java @@ -31,7 +31,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.OptionsResolver; @@ -39,6 +39,7 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.IOUtils; import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.sink.bulk.BulkInsertWriterHelper; import org.apache.hudi.sink.bulk.sort.SortOperatorGen; @@ -262,9 +263,9 @@ private Iterator readRecordsForGroupWithLogs(List for (ClusteringOperation clusteringOp : clusteringOps) { try { - Option baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) + Option baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) ? 
Option.empty() - : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); + : Option.of(HoodieFileReaderFactory.getReaderFactory(table.getConfig().getRecordMerger().getRecordType()).getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(table.getMetaClient().getFs()) .withBasePath(table.getMetaClient().getBasePath()) @@ -278,18 +279,18 @@ private Iterator readRecordsForGroupWithLogs(List .withSpillableMapBasePath(writeConfig.getSpillableMapBasePath()) .withDiskMapType(writeConfig.getCommonConfig().getSpillableDiskMapType()) .withBitCaskDiskMapCompressionEnabled(writeConfig.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .withRecordMerger(writeConfig.getRecordMerger()) .build(); HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); HoodieFileSliceReader hoodieFileSliceReader = HoodieFileSliceReader.getFileSliceReader(baseFileReader, scanner, readerSchema, - tableConfig.getPayloadClass(), - tableConfig.getPreCombineField(), + tableConfig.getProps(), tableConfig.populateMetaFields() ? Option.empty() : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp()))); recordIterators.add(StreamSupport.stream(Spliterators.spliteratorUnknownSize(hoodieFileSliceReader, Spliterator.NONNULL), false).map(hoodieRecord -> { try { - return this.transform(hoodieRecord.toIndexedRecord(readerSchema, new Properties()).get()); + return this.transform(hoodieRecord.toIndexedRecord(readerSchema, new Properties()).get().getData()); } catch (IOException e) { throw new HoodieIOException("Failed to read next record", e); } @@ -310,7 +311,8 @@ private Iterator readRecordsForGroupBaseFiles(List List> iteratorsForPartition = clusteringOps.stream().map(clusteringOp -> { Iterable indexedRecords = () -> { try { - return HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath())).getRecordIterator(readerSchema); + return ((HoodieAvroFileReader)HoodieFileReaderFactory.getReaderFactory(table.getConfig().getRecordMerger().getRecordType()) + .getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))).getIndexedRecordIterator(readerSchema); } catch (IOException e) { throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java index db13a6c2ae584..2dd89e96dabc7 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java @@ -19,6 +19,7 @@ package org.apache.hudi.sink.partitioner.profile; import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -79,7 +80,7 @@ public class WriteProfile { /** * Total records to write for each bucket based on - * the config option {@link org.apache.hudi.config.HoodieStorageConfig#PARQUET_MAX_FILE_SIZE}. 
+ * the config option {@link HoodieStorageConfig#PARQUET_MAX_FILE_SIZE}. */ private long recordsPerBucket; diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java index 3143f55c6373f..517e57be26fff 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java @@ -19,7 +19,7 @@ package org.apache.hudi.streamer; import org.apache.hudi.client.utils.OperationConverter; -import org.apache.hudi.common.model.HoodieAvroRecordMerge; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.WriteOperationType; @@ -119,9 +119,13 @@ public class FlinkStreamerConfig extends Configuration { + "a GenericRecord. Implement your own, if you want to do something other than overwriting existing value.") public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName(); - @Parameter(names = {"--merge-class"}, description = "Implements of HoodieMerge, that defines how to merge two records." - + "Implement your own, if you want to implement specific record merge logic.") - public String mergeClassName = HoodieAvroRecordMerge.class.getName(); + @Parameter(names = {"--merger-impls"}, description = "List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. " + + "These merger impls will filter by merger-strategy " + + "Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)") + public String mergerImpls = HoodieAvroRecordMerger.class.getName(); + + @Parameter(names = {"--merger-strategy"}, description = "Id of merger strategy. 
Hudi will pick RecordMergers in merger-impls which has the same merger strategy id") + public String mergerStrategy = StringUtils.DEFAULT_MERGER_STRATEGY_UUID; @Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input " + "is purely new data/inserts to gain speed).", converter = OperationConverter.class) @@ -368,7 +372,8 @@ public static org.apache.flink.configuration.Configuration toFlinkConfig(FlinkSt conf.setString(FlinkOptions.OPERATION, config.operation.value()); conf.setString(FlinkOptions.PRECOMBINE_FIELD, config.sourceOrderingField); conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, config.payloadClassName); - conf.setString(FlinkOptions.MERGE_CLASS_NAME, config.mergeClassName); + conf.setString(FlinkOptions.RECORD_MERGER_IMPLS, config.mergerImpls); + conf.setString(FlinkOptions.RECORD_MERGER_STRATEGY, config.mergerStrategy); conf.setBoolean(FlinkOptions.PRE_COMBINE, config.preCombine); conf.setInteger(FlinkOptions.RETRY_TIMES, Integer.parseInt(config.instantRetryTimes)); conf.setLong(FlinkOptions.RETRY_INTERVAL_MS, Long.parseLong(config.instantRetryInterval)); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java index 5e8239cf6d1c9..6fac5e4b8810e 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java @@ -423,8 +423,7 @@ private MergeOnReadInputFormat mergeOnReadInputFormat( tableAvroSchema.toString(), AvroSchemaConverter.convertToSchema(requiredRowType).toString(), inputSplits, - conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(","), - conf.getString(FlinkOptions.MERGE_CLASS_NAME)); + conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",")); return MergeOnReadInputFormat.builder() .config(this.conf) .tableState(hoodieTableState) diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java index 8adbde355cf7e..00fa1b34d8b2a 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java @@ -18,13 +18,17 @@ package org.apache.hudi.table.format; +import java.util.stream.Collectors; +import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.Functions; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer; @@ -140,6 +144,7 @@ public static HoodieMergedLogRecordScanner logScanner( .withSpillableMapBasePath(writeConfig.getSpillableMapBasePath()) .withInstantRange(split.getInstantRange()) .withOperationField(flinkConf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)) + 
.withRecordMerger(writeConfig.getRecordMerger()) .build(); } @@ -149,6 +154,12 @@ private static HoodieUnMergedLogRecordScanner unMergedLogScanner( org.apache.flink.configuration.Configuration flinkConf, Configuration hadoopConf, HoodieUnMergedLogRecordScanner.LogRecordScannerCallback callback) { + List mergers = Arrays.stream(flinkConf.getString(FlinkOptions.RECORD_MERGER_IMPLS).split(",")) + .map(String::trim) + .distinct() + .collect(Collectors.toList()); + HoodieRecordMerger merger = HoodieRecordUtils.generateRecordMerger( + split.getTablePath(), EngineType.FLINK, mergers, flinkConf.getString(FlinkOptions.RECORD_MERGER_STRATEGY)); FileSystem fs = FSUtils.getFs(split.getTablePath(), hadoopConf); return HoodieUnMergedLogRecordScanner.newBuilder() .withFileSystem(fs) @@ -166,6 +177,7 @@ private static HoodieUnMergedLogRecordScanner unMergedLogScanner( HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) .withInstantRange(split.getInstantRange()) .withLogRecordScannerCallback(callback) + .withRecordMerger(merger) .build(); } @@ -243,6 +255,7 @@ public static HoodieMergedLogRecordScanner logScanner( .withSpillableMapBasePath(writeConfig.getSpillableMapBasePath()) .withDiskMapType(writeConfig.getCommonConfig().getSpillableDiskMapType()) .withBitCaskDiskMapCompressionEnabled(writeConfig.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .withRecordMerger(writeConfig.getRecordMerger()) .build(); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java index 09f4984c770f2..776b20fde224c 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java @@ -18,11 +18,13 @@ package org.apache.hudi.table.format.mor; +import java.util.stream.Collectors; +import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.log.InstantRange; import org.apache.hudi.common.util.ClosableIterator; @@ -205,8 +207,7 @@ public void open(MergeOnReadInputSplit split) throws IOException { this.requiredPos, this.emitDelete, this.tableState.getOperationPos(), - getFullSchemaReader(split.getBasePath().get()), - tableState.getMergeClass()); + getFullSchemaReader(split.getBasePath().get())); } else { throw new HoodieException("Unable to select an Iterator to read the Hoodie MOR File Split for " + "file path: " + split.getBasePath() @@ -644,7 +645,7 @@ static class MergeIterator implements RecordIterator { private final InstantRange instantRange; - private final HoodieMerge merge; + private final HoodieRecordMerger recordMerger; // add the flag because the flink ParquetColumnarRowSplitReader is buggy: // method #reachedEnd() returns false after it returns true. 
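Every HoodieMergedLogRecordScanner built in this patch now declares its merger explicitly via withRecordMerger(...). A sketch of the pattern used by the tests and by FormatUtils, assuming the builder methods shown in these hunks; the pre-existing builder settings are elided in a comment:

    import org.apache.hudi.common.model.HoodieAvroRecordMerger;
    import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
    import org.apache.hudi.common.util.HoodieRecordUtils;

    // Fragment: the existing builder chain gains one call that supplies the record merger.
    HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
        // ... file system, base path, log files, schema, spillable-map settings as before ...
        .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName()))
        .build();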
@@ -668,8 +669,7 @@ static class MergeIterator implements RecordIterator { int[] requiredPos, boolean emitDelete, int operationPos, - ParquetColumnarRowSplitReader reader, // the reader should be with full schema - String mergeClass) { + ParquetColumnarRowSplitReader reader) { // the reader should be with full schema this.tableSchema = tableSchema; this.reader = reader; this.scanner = FormatUtils.logScanner(split, tableSchema, flinkConf, hadoopConf); @@ -684,7 +684,11 @@ static class MergeIterator implements RecordIterator { this.avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(requiredRowType); this.projection = RowDataProjection.instance(requiredRowType, requiredPos); this.instantRange = split.getInstantRange().orElse(null); - this.merge = HoodieRecordUtils.loadMerge(mergeClass); + List mergers = Arrays.stream(flinkConf.getString(FlinkOptions.RECORD_MERGER_IMPLS).split(",")) + .map(String::trim) + .distinct() + .collect(Collectors.toList()); + this.recordMerger = HoodieRecordUtils.generateRecordMerger(split.getTablePath(), EngineType.FLINK, mergers, flinkConf.getString(FlinkOptions.RECORD_MERGER_STRATEGY)); } @Override @@ -701,18 +705,18 @@ public boolean reachedEnd() throws IOException { final String curKey = currentRecord.getString(HOODIE_RECORD_KEY_COL_POS).toString(); if (scanner.getRecords().containsKey(curKey)) { keyToSkip.add(curKey); - Option mergedAvroRecord = mergeRowWithLog(currentRecord, curKey); + Option mergedAvroRecord = mergeRowWithLog(currentRecord, curKey); if (!mergedAvroRecord.isPresent()) { // deleted continue; } else { - final RowKind rowKind = FormatUtils.getRowKindSafely(mergedAvroRecord.get(), this.operationPos); + final RowKind rowKind = FormatUtils.getRowKindSafely(mergedAvroRecord.get().getData(), this.operationPos); if (!emitDelete && rowKind == RowKind.DELETE) { // deleted continue; } GenericRecord avroRecord = buildAvroRecordBySchema( - mergedAvroRecord.get(), + mergedAvroRecord.get().getData(), requiredSchema, requiredPos, recordBuilder); @@ -770,13 +774,14 @@ public void close() throws IOException { } } - private Option mergeRowWithLog( + private Option mergeRowWithLog( RowData curRow, String curKey) throws IOException { final HoodieAvroRecord record = (HoodieAvroRecord) scanner.getRecords().get(curKey); GenericRecord historyAvroRecord = (GenericRecord) rowDataToAvroConverter.convert(tableSchema, curRow); - Option resultRecord = merge.combineAndGetUpdateValue(new HoodieAvroIndexedRecord(historyAvroRecord), record, tableSchema, payloadProps); - return ((HoodieAvroIndexedRecord) resultRecord.get()).toIndexedRecord(); + HoodieAvroIndexedRecord hoodieAvroIndexedRecord = new HoodieAvroIndexedRecord(historyAvroRecord); + Option resultRecord = recordMerger.merge(hoodieAvroIndexedRecord, record, tableSchema, payloadProps); + return resultRecord.get().toIndexedRecord(tableSchema, new Properties()); } } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadTableState.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadTableState.java index bbb21db7f8d28..36dfecbb79a5f 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadTableState.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadTableState.java @@ -41,7 +41,6 @@ public class MergeOnReadTableState implements Serializable { private final List inputSplits; private final String[] pkFields; private final 
int operationPos; - private final String mergeClass; public MergeOnReadTableState( RowType rowType, @@ -49,8 +48,7 @@ public MergeOnReadTableState( String avroSchema, String requiredAvroSchema, List inputSplits, - String[] pkFields, - String mergeClass) { + String[] pkFields) { this.rowType = rowType; this.requiredRowType = requiredRowType; this.avroSchema = avroSchema; @@ -58,7 +56,6 @@ public MergeOnReadTableState( this.inputSplits = inputSplits; this.pkFields = pkFields; this.operationPos = rowType.getFieldIndex(HoodieRecord.OPERATION_METADATA_FIELD); - this.mergeClass = mergeClass; } public RowType getRowType() { @@ -85,10 +82,6 @@ public int getOperationPos() { return operationPos; } - public String getMergeClass() { - return mergeClass; - } - public int[] getRequiredPositions() { final List fieldNames = rowType.getFieldNames(); return requiredRowType.getFieldNames().stream() diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java index 85ea097c6431c..f844f08c67028 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java @@ -26,6 +26,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCleaningPolicy; @@ -47,7 +48,6 @@ import org.apache.hudi.config.HoodieLockConfig; import org.apache.hudi.config.HoodieMemoryConfig; import org.apache.hudi.config.HoodiePayloadConfig; -import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.HadoopConfigurations; @@ -195,7 +195,6 @@ public static HoodieWriteConfig getHoodieClientConfig( .archiveCommitsWith(conf.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS), conf.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS)) .build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withMergeClass(conf.getString(FlinkOptions.MERGE_CLASS_NAME)) .withTargetIOPerCompactionInMB(conf.getLong(FlinkOptions.COMPACTION_TARGET_IO)) .withInlineCompactionTriggerStrategy( CompactionTriggerStrategy.valueOf(conf.getString(FlinkOptions.COMPACTION_TRIGGER_STRATEGY).toUpperCase(Locale.ROOT))) @@ -232,6 +231,8 @@ public static HoodieWriteConfig getHoodieClientConfig( .withEmbeddedTimelineServerReuseEnabled(true) // make write client embedded timeline service singleton .withAutoCommit(false) .withAllowOperationMetadataField(conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)) + .withMergerImpls(conf.getString(FlinkOptions.RECORD_MERGER_IMPLS)) + .withMergerStrategy(conf.getString(FlinkOptions.RECORD_MERGER_STRATEGY)) .withProps(flinkConf2TypedProperties(conf)) .withSchema(getSourceSchema(conf).toString()); @@ -310,7 +311,6 @@ public static HoodieTableMetaClient initTableIfNotExists( .setTableName(conf.getString(FlinkOptions.TABLE_NAME)) .setRecordKeyFields(conf.getString(FlinkOptions.RECORD_KEY_FIELD, null)) .setPayloadClassName(conf.getString(FlinkOptions.PAYLOAD_CLASS_NAME)) - .setMergeClassName(conf.getString(FlinkOptions.MERGE_CLASS_NAME)) 
.setPreCombineField(OptionsResolver.getPreCombineField(conf)) .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) .setPartitionFields(conf.getString(FlinkOptions.PARTITION_PATH_FIELD, null)) diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java index f2095c1844059..63d5c1f6bdbf1 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java @@ -263,8 +263,7 @@ private OneInputStreamOperatorTestHarness create tableAvroSchema.toString(), AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE).toString(), Collections.emptyList(), - new String[0], - metaClient.getTableConfig().getMergeClass()); + new String[0]); MergeOnReadInputFormat inputFormat = MergeOnReadInputFormat.builder() .config(conf) .tableState(hoodieTableState) diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java index 0338895c14978..88d488a5bc073 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -31,6 +32,7 @@ import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.configuration.FlinkOptions; @@ -771,6 +773,7 @@ private static HoodieMergedLogRecordScanner getScanner( .withSpillableMapBasePath("/tmp/") .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileRecordReader.java index 4fa62e79eabd0..5b7e7fbc6774b 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileRecordReader.java @@ -56,7 +56,7 @@ public HoodieHFileRecordReader(Configuration conf, InputSplit split, JobConf job @Override public boolean next(NullWritable key, ArrayWritable value) throws IOException { if (recordIterator == null) { - recordIterator = reader.getRecordIterator(schema); + recordIterator = reader.getIndexedRecordIterator(schema); } if (!recordIterator.hasNext()) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputSplitUtils.java 
b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputSplitUtils.java index ae550cb335a56..f2437eb6f60de 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputSplitUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputSplitUtils.java @@ -22,9 +22,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapred.FileSplit; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import java.io.DataInput; @@ -64,7 +65,8 @@ public static boolean readBoolean(DataInput in) throws IOException { public static Schema getBaseFileSchema(FileSplit split, Configuration conf) { try { if (split instanceof BootstrapBaseFileSplit) { - HoodieAvroFileReader storageReader = HoodieFileReaderFactory.getFileReader(conf, + // TODO mr support other record type + HoodieFileReader storageReader = HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(conf, ((BootstrapBaseFileSplit)(split)).getBootstrapFileSplit().getPath()); return HoodieAvroUtils.addMetadataFields(storageReader.getSchema()); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java index 4fa3781aeaf4d..c2b8864bc3e29 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java @@ -27,8 +27,11 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; @@ -89,10 +92,11 @@ private HoodieMergedLogRecordScanner getMergedLogRecordScanner() throws IOExcept .withDiskMapType(jobConf.getEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())) .withBitCaskDiskMapCompressionEnabled(jobConf.getBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); } - private Option buildGenericRecordwithCustomPayload(HoodieRecord record) throws IOException { + private Option buildGenericRecordwithCustomPayload(HoodieRecord record) throws IOException { if (usesCustomPayload) { return record.toIndexedRecord(getWriterSchema(), payloadProps); } else { @@ -112,7 +116,7 @@ public boolean next(NullWritable aVoid, ArrayWritable arrayWritable) throws IOEx this.deltaRecordKeys.remove(key); // TODO(NA): Invoke preCombine here by converting arrayWritable to Avro. 
This is required since the // deltaRecord may not be a full record and needs values of columns from the parquet - Option rec = buildGenericRecordwithCustomPayload(deltaRecordMap.get(key)); + Option rec = buildGenericRecordwithCustomPayload(deltaRecordMap.get(key)); // If the record is not present, this is a delete record using an empty payload so skip this base record // and move to the next record if (!rec.isPresent()) { @@ -129,7 +133,7 @@ public boolean next(NullWritable aVoid, ArrayWritable arrayWritable) throws IOEx } while (this.deltaItr.hasNext()) { final String key = this.deltaItr.next(); - Option rec = buildGenericRecordwithCustomPayload(deltaRecordMap.get(key)); + Option rec = buildGenericRecordwithCustomPayload(deltaRecordMap.get(key)); if (rec.isPresent()) { setUpWritable(rec, arrayWritable, key); return true; @@ -138,12 +142,12 @@ public boolean next(NullWritable aVoid, ArrayWritable arrayWritable) throws IOEx return false; } - private void setUpWritable(Option rec, ArrayWritable arrayWritable, String key) { - GenericRecord recordToReturn = rec.get(); + private void setUpWritable(Option rec, ArrayWritable arrayWritable, String key) { + GenericRecord recordToReturn = (GenericRecord) rec.get().getData(); if (usesCustomPayload) { // If using a custom payload, return only the projection fields. The readerSchema is a schema derived from // the writerSchema with only the projection fields - recordToReturn = HoodieAvroUtils.rewriteRecord(rec.get(), getReaderSchema()); + recordToReturn = HoodieAvroUtils.rewriteRecord((GenericRecord) rec.get().getData(), getReaderSchema()); } // we assume, a later safe record in the log, is newer than what we have in the map & // replace it. Since we want to return an arrayWritable which is the same length as the elements in the latest diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java index e418bebb50136..db187f18cf4bf 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java @@ -19,9 +19,11 @@ package org.apache.hudi.hadoop.realtime; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.Functions; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer; @@ -92,10 +94,11 @@ public RealtimeUnmergedRecordReader(RealtimeSplit split, JobConf job, .withBufferSize(this.jobConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) .withLogRecordScannerCallback(record -> { // convert Hoodie log record to Hadoop AvroWritable and buffer - GenericRecord rec = (GenericRecord) record.toIndexedRecord(getReaderSchema(), payloadProps).get(); + GenericRecord rec = (GenericRecord) record.toIndexedRecord(getReaderSchema(), payloadProps).get().getData(); ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(rec, getHiveSchema()); this.executor.getQueue().insertRecord(aWritable); }) + 
.withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); // Start reading and buffering this.executor.startProducers(); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java index 0e7dce2b0fc95..0a3b86f10967a 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java @@ -41,11 +41,13 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.JobConf; + +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; -import org.apache.hudi.io.storage.HoodieAvroFileReader; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -72,7 +74,8 @@ public class HoodieRealtimeRecordReaderUtils { */ public static Schema readSchema(Configuration conf, Path filePath) { try { - HoodieAvroFileReader storageReader = HoodieFileReaderFactory.getFileReader(conf, filePath); + // TODO mr support other record type + HoodieFileReader storageReader = HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(conf, filePath); return storageReader.getSchema(); } catch (IOException e) { throw new HoodieIOException("Failed to read schema from " + filePath, e); diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java index 0be736b2a043d..7e571720c175a 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.hadoop.conf.Configuration; @@ -159,6 +160,7 @@ static HoodieTableMetaClient initTableType(Configuration hadoopConf, String base properties.setProperty(HoodieTableConfig.NAME.key(), tableName); properties.setProperty(HoodieTableConfig.TYPE.key(), tableType.name()); properties.setProperty(HoodieTableConfig.PAYLOAD_CLASS_NAME.key(), HoodieAvroPayload.class.getName()); + properties.setProperty(HoodieTableConfig.MERGER_STRATEGY.key(), StringUtils.DEFAULT_MERGER_STRATEGY_UUID); return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties); } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java index 733bdda275191..0644656ce4e40 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java +++ 
b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java @@ -22,20 +22,25 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; + import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.TypeUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieMemoryConfig; import org.apache.hudi.io.storage.HoodieAvroFileReader; @@ -262,9 +267,9 @@ private Iterator readColumnarOrLogFiles(FileSlice fileSlice) thro if (fileSlice.getBaseFile().isPresent()) { // Read the base files using the latest writer schema. Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr)); - HoodieAvroFileReader reader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), - new Path(fileSlice.getBaseFile().get().getPath())); - return reader.getRecordIterator(schema); + HoodieAvroFileReader reader = TypeUtils.unsafeCast(HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(metaClient.getHadoopConf(), + new Path(fileSlice.getBaseFile().get().getPath()))); + return reader.getIndexedRecordIterator(schema); } else { // If there is no data file, fall back to reading log files HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() @@ -283,6 +288,7 @@ private Iterator readColumnarOrLogFiles(FileSlice fileSlice) thro .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue()) .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(HoodieAvroRecordMerger.class.getName())) .build(); // readAvro log files Iterable iterable = () -> scanner.iterator(); diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java index ee807f49dae89..34a300ae79abc 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -37,7 +37,7 @@ import org.apache.hudi.common.util.TablePathUtils; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodiePayloadConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import 
org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieNotSupportedException; diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkRecordMerge.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/HoodieSparkRecordMerger.java similarity index 51% rename from hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkRecordMerge.java rename to hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/HoodieSparkRecordMerger.java index 88ae7c13df168..051a109a264f4 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkRecordMerge.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/HoodieSparkRecordMerger.java @@ -16,33 +16,49 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi; +package org.apache.hudi; -import org.apache.avro.Schema; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieMerge; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; +import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; + +import org.apache.avro.Schema; import java.io.IOException; import java.util.Properties; -public class HoodieSparkRecordMerge implements HoodieMerge { +public class HoodieSparkRecordMerger implements HoodieRecordMerger { + + @Override + public String getMergingStrategy() { + return StringUtils.DEFAULT_MERGER_STRATEGY_UUID; + } @Override - public HoodieRecord preCombine(HoodieRecord older, HoodieRecord newer) { + public Option merge(HoodieRecord older, HoodieRecord newer, Schema schema, Properties props) throws IOException { + ValidationUtils.checkArgument(older.getRecordType() == HoodieRecordType.SPARK); + ValidationUtils.checkArgument(newer.getRecordType() == HoodieRecordType.SPARK); + + if (newer.getData() == null) { + // Delete record + return Option.empty(); + } if (older.getData() == null) { // use natural order for delete record - return older; + return Option.of(newer); } - if (older.getOrderingValue().compareTo(newer.getOrderingValue()) > 0) { - return older; + if (older.getOrderingValue(props).compareTo(newer.getOrderingValue(props)) > 0) { + return Option.of(older); } else { - return newer; + return Option.of(newer); } } @Override - public Option combineAndGetUpdateValue(HoodieRecord older, HoodieRecord newer, Schema schema, Properties props) throws IOException { - return Option.of(newer); + public HoodieRecordType getRecordType() { + return HoodieRecordType.SPARK; } } \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index d5d741595b91f..78173e847beb9 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -302,10 +302,15 @@ object DataSourceWriteOptions { val PAYLOAD_CLASS_NAME = HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME /** - * HoodieMerge will replace the payload to process the merge of data + * HoodieMerger will replace the payload to process the merge of data * and provide the 
same capabilities as the payload */ - val MERGE_CLASS_NAME = HoodieWriteConfig.MERGE_CLASS_NAME + val MERGER_IMPLS = HoodieWriteConfig.MERGER_IMPLS + + /** + * Id of merger strategy + */ + val MERGER_STRATEGY = HoodieWriteConfig.MERGER_STRATEGY /** * Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala index 569ffa690b3e9..ece56a5159172 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala @@ -36,9 +36,10 @@ import org.apache.hudi.common.model.{FileSlice, HoodieFileFormat, HoodieRecord} import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} import org.apache.hudi.common.table.view.HoodieTableFileSystemView import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.common.util.{ConfigUtils, HoodieRecordUtils, StringUtils} import org.apache.hudi.common.util.StringUtils.isNullOrEmpty import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper} import org.apache.hudi.internal.schema.{HoodieSchemaException, InternalSchema} @@ -60,7 +61,6 @@ import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{HoodieCatalystExpressionUtils, Row, SQLContext, SparkSession} import org.apache.spark.unsafe.types.UTF8String - import java.net.URI import java.util.Locale @@ -79,7 +79,8 @@ case class HoodieTableState(tablePath: String, usesVirtualKeys: Boolean, recordPayloadClassName: String, metadataConfig: HoodieMetadataConfig, - mergeClass: String) + mergerImpls: String, + mergerStrategy: String) /** * Hoodie BaseRelation which extends [[PrunedFilteredScan]]. 
@@ -469,7 +470,10 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, usesVirtualKeys = !tableConfig.populateMetaFields(), recordPayloadClassName = tableConfig.getPayloadClass, metadataConfig = fileIndex.metadataConfig, - mergeClass = tableConfig.getMergeClass + mergerImpls = optParams.getOrElse(HoodieWriteConfig.MERGER_IMPLS.key(), + HoodieWriteConfig.MERGER_IMPLS.defaultValue()), + mergerStrategy = optParams.getOrElse(HoodieWriteConfig.MERGER_STRATEGY.key(), + metaClient.getTableConfig.getMergerStrategy) ) } @@ -746,7 +750,7 @@ object HoodieBaseRelation extends SparkAdapterSupport { val requiredAvroSchema = new Schema.Parser().parse(requiredDataSchema.avroSchemaStr) val avroToRowConverter = AvroConversionUtils.createAvroToInternalRowConverter(requiredAvroSchema, requiredRowSchema) - reader.getRecordIterator(requiredAvroSchema).asScala + reader.getIndexedRecordIterator(requiredAvroSchema).asScala .map(record => { avroToRowConverter.apply(record.asInstanceOf[GenericRecord]).get }) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala index d0032088a65f9..19711f4428337 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala @@ -18,7 +18,7 @@ package org.apache.hudi -import org.apache.avro.Schema +import org.apache.avro.{Schema, SchemaNormalization} import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -28,12 +28,12 @@ import org.apache.hudi.HoodieConversionUtils.{toJavaOption, toScalaOption} import org.apache.hudi.HoodieMergeOnReadRDD.SafeAvroProjection.collectFieldOrdinals import org.apache.hudi.HoodieMergeOnReadRDD._ import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig} -import org.apache.hudi.common.engine.HoodieLocalEngineContext +import org.apache.hudi.common.engine.{EngineType, HoodieLocalEngineContext} import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath -import org.apache.hudi.common.model.{HoodieAvroIndexedRecord, HoodieLogFile, HoodieRecord, HoodieRecordPayload, OverwriteWithLatestAvroPayload} +import org.apache.hudi.common.model.{HoodieAvroIndexedRecord, HoodieEmptyRecord, HoodieLogFile, HoodieRecord, HoodieRecordPayload, OverwriteWithLatestAvroPayload} import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner -import org.apache.hudi.common.util.HoodieRecordUtils +import org.apache.hudi.common.util.{HoodieRecordUtils, SerializationUtils} import org.apache.hudi.common.util.ValidationUtils.checkState import org.apache.hudi.config.HoodiePayloadConfig import org.apache.hudi.exception.HoodieException @@ -48,7 +48,11 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType import org.apache.spark.{Partition, SerializableWritable, SparkContext, TaskContext} import java.io.Closeable +import java.nio.charset.StandardCharsets import java.util.Properties +import org.apache.hudi.commmon.model.HoodieSparkRecord +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType +import org.apache.spark.sql.hudi.SparkStructTypeSerializer import scala.annotation.tailrec import scala.collection.JavaConverters._ import scala.util.Try @@ -191,6 +195,8 @@ 
class HoodieMergeOnReadRDD(@transient sc: SparkContext, private val requiredSchemaSafeAvroProjection = SafeAvroProjection.create(logFileReaderAvroSchema, requiredAvroSchema) + private val requiredSchemaSafeRowProjection = HoodieBaseRelation.generateUnsafeProjection(tableSchema.structTypeSchema, requiredSchema.structTypeSchema) + private var logScanner = { val internalSchema = tableSchema.internalSchema.getOrElse(InternalSchema.getEmptyInternalSchema) HoodieMergeOnReadRDD.scanLog(split.logFiles, getPartitionPath(split), logFileReaderAvroSchema, tableState, @@ -201,12 +207,13 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, // NOTE: This have to stay lazy to make sure it's initialized only at the point where it's // going to be used, since we modify `logRecords` before that and therefore can't do it any earlier - protected lazy val logRecordsIterator: Iterator[Option[GenericRecord]] = - logRecords.iterator.map { - case (_, record) => - toScalaOption(record.getData.asInstanceOf[HoodieRecordPayload[_]].getInsertValue(logFileReaderAvroSchema, payloadProps)) - .map(_.asInstanceOf[GenericRecord]) - } + protected lazy val logRecordsIterator: Iterator[Any] = logRecords.iterator.map { + case (_, record: HoodieSparkRecord) => record + case (_, _: HoodieEmptyRecord[_]) => Option.empty + case (_, record) => + toScalaOption(record.getData.asInstanceOf[HoodieRecordPayload[_]].getInsertValue(logFileReaderAvroSchema, payloadProps)) + .map(_.asInstanceOf[GenericRecord]) + } protected def removeLogRecord(key: String): Option[HoodieRecord[_]] = logRecords.remove(key) @@ -218,14 +225,15 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, // handling records @tailrec private def hasNextInternal: Boolean = { logRecordsIterator.hasNext && { - val avroRecordOpt = logRecordsIterator.next() - if (avroRecordOpt.isEmpty) { - // Record has been deleted, skipping - this.hasNextInternal - } else { - val projectedAvroRecord = requiredSchemaSafeAvroProjection(avroRecordOpt.get) - recordToLoad = deserialize(projectedAvroRecord) - true + logRecordsIterator.next() match { + case Some(r: GenericRecord) => + val projectedAvroRecord = requiredSchemaSafeAvroProjection(r) + recordToLoad = deserialize(projectedAvroRecord) + true + case None => this.hasNextInternal + case r: HoodieSparkRecord => + recordToLoad = requiredSchemaSafeRowProjection(r.getData) + true } } } @@ -294,7 +302,9 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, private val baseFileIterator = baseFileReader(split.dataFile.get) - private val merger = HoodieRecordUtils.loadMerge(tableState.mergeClass) + val mergerList = tableState.mergerImpls.split(",") + .map(_.trim).distinct.toList.asJava + private val recordMerger = HoodieRecordUtils.generateRecordMerger(tableState.tablePath, EngineType.SPARK, mergerList, tableState.mergerStrategy) override def hasNext: Boolean = hasNextInternal @@ -311,14 +321,12 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, recordToLoad = requiredSchemaUnsafeProjection(curRow) true } else { - val mergedAvroRecordOpt = merge(serialize(curRow), updatedRecordOpt.get) - if (mergedAvroRecordOpt.isEmpty) { + val mergedRecordOpt = merge(curRow, updatedRecordOpt.get) + if (mergedRecordOpt.isEmpty) { // Record has been deleted, skipping this.hasNextInternal } else { - val projectedAvroRecord = projectAvroUnsafe(mergedAvroRecordOpt.get.getData.asInstanceOf[GenericRecord], - requiredAvroSchema, reusableRecordBuilder) - recordToLoad = deserialize(projectedAvroRecord) + recordToLoad = mergedRecordOpt.get true } } @@ 
-330,10 +338,26 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, private def serialize(curRowRecord: InternalRow): GenericRecord = serializer.serialize(curRowRecord).asInstanceOf[GenericRecord] - private def merge(curAvroRecord: GenericRecord, newRecord: HoodieRecord[_]): Option[HoodieRecord[_]] = { + private def merge(curRow: InternalRow, newRecord: HoodieRecord[_]): Option[InternalRow] = { // NOTE: We have to pass in Avro Schema used to read from Delta Log file since we invoke combining API // on the record from the Delta Log - toScalaOption(merger.combineAndGetUpdateValue(new HoodieAvroIndexedRecord(curAvroRecord), newRecord, logFileReaderAvroSchema, payloadProps)) + val curRecord = recordMerger.getRecordType match { + case HoodieRecordType.SPARK => + new HoodieSparkRecord(curRow, baseFileReader.schema) + case _ => + new HoodieAvroIndexedRecord(serialize(curRow)) + } + recordMerger.getRecordType match { + case HoodieRecordType.SPARK => + toScalaOption(recordMerger.merge(curRecord, newRecord, logFileReaderAvroSchema, payloadProps)) + .map(r => { + val projection = HoodieInternalRowUtils.getCachedUnsafeProjection(r.asInstanceOf[HoodieSparkRecord].getStructType, requiredStructTypeSchema) + projection.apply(r.getData.asInstanceOf[InternalRow]) + }) + case _ => + toScalaOption(recordMerger.merge(curRecord, newRecord, logFileReaderAvroSchema, payloadProps)) + .map(r => deserialize(projectAvroUnsafe(r.toIndexedRecord(logFileReaderAvroSchema, new Properties()).get().getData.asInstanceOf[GenericRecord], requiredAvroSchema, reusableRecordBuilder))) + } } } } @@ -401,6 +425,15 @@ private object HoodieMergeOnReadRDD { getRelativePartitionPath(new Path(tableState.tablePath), logFiles.head.getPath.getParent)) } + val mergerList = tableState.mergerImpls.split(",") + .map(_.trim).distinct.toList.asJava + val recordMerger = HoodieRecordUtils.generateRecordMerger(tableState.tablePath, EngineType.SPARK, mergerList, tableState.mergerStrategy) + logRecordScannerBuilder.withRecordMerger(recordMerger) + + if (recordMerger.getRecordType == HoodieRecordType.SPARK) { + registerStructTypeSerializerIfNeed(List(HoodieInternalRowUtils.getCachedSchema(logSchema))) + } + logRecordScannerBuilder.build() } } @@ -420,6 +453,13 @@ private object HoodieMergeOnReadRDD { .getParent } + private def registerStructTypeSerializerIfNeed(schemas: List[StructType]): Unit = { + val schemaMap = schemas.map(schema => (SchemaNormalization.fingerprint64(schema.json.getBytes(StandardCharsets.UTF_8)), schema)) + .toMap + val serializer = new SparkStructTypeSerializer(schemaMap) + SerializationUtils.setOverallRegister(classOf[HoodieSparkRecord].getName, serializer) + } + // TODO extract to HoodieAvroSchemaUtils abstract class AvroProjection extends (GenericRecord => GenericRecord) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 1e1b05dd69acc..d7a035cfd2b50 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -26,12 +26,15 @@ import org.apache.hudi.HoodieConversionUtils.{toProperties, toScalaOption} import org.apache.hudi.HoodieWriterUtils._ import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.client.{HoodieWriteResult, SparkRDDWriteClient} +import 
org.apache.hudi.commmon.model.HoodieSparkRecord import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieConfig, HoodieMetadataConfig, TypedProperties} import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import org.apache.hudi.common.model._ +import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType import org.apache.hudi.common.table.timeline.HoodieActiveTimeline import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.util.{CommitUtils, StringUtils} +import org.apache.hudi.common.util.{CommitUtils, ConfigUtils, StringUtils} import org.apache.hudi.config.HoodieBootstrapConfig.{BASE_PATH, INDEX_CLASS_NAME, KEYGEN_CLASS_NAME} import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig} import org.apache.hudi.exception.HoodieException @@ -43,17 +46,18 @@ import org.apache.hudi.internal.schema.InternalSchema import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter import org.apache.hudi.internal.schema.utils.{AvroSchemaEvolutionUtils, SerDeHelper} import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory -import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator} +import org.apache.hudi.keygen.{SparkKeyGeneratorInterface, TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator} import org.apache.hudi.metrics.Metrics import org.apache.hudi.sync.common.HoodieSyncConfig import org.apache.hudi.sync.common.util.SyncUtilHelpers import org.apache.hudi.table.BulkInsertPartitioner import org.apache.hudi.util.SparkKeyGenUtils import org.apache.log4j.LogManager -import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.internal.StaticSQLConf +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType import org.apache.spark.{SPARK_VERSION, SparkContext} @@ -152,7 +156,6 @@ object HoodieSparkSqlWriter { .setBaseFileFormat(baseFileFormat) .setArchiveLogFolder(archiveLogFolder) .setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_NAME)) - .setMergeClassName(hoodieConfig.getString(MERGE_CLASS_NAME)) // we can't fetch preCombine field from hoodieConfig object, since it falls back to "ts" as default value, // but we are interested in what user has set, hence fetching from optParams. 
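[Illustrative note, not part of the patch] With setMergeClassName dropped from table initialization, the merger wiring lives entirely in the write config, as the StreamerUtil hunk earlier shows with withMergerImpls / withMergerStrategy. A minimal sketch of assembling such a config; the path, schema string and the particular pair of candidate mergers below are placeholders.

import org.apache.hudi.HoodieSparkRecordMerger;
import org.apache.hudi.common.model.HoodieAvroRecordMerger;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;

public class MergerWriteConfigSketch {
  static HoodieWriteConfig build(String basePath, String schemaJson) {
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(schemaJson)
        // Comma-separated candidates; the strategy id decides which one is picked at runtime.
        .withMergerImpls(HoodieSparkRecordMerger.class.getName() + "," + HoodieAvroRecordMerger.class.getName())
        .withMergerStrategy(StringUtils.DEFAULT_MERGER_STRATEGY_UUID)
        .build();
  }
}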
.setPreCombineField(optParams.getOrElse(PRECOMBINE_FIELD.key(), null)) @@ -284,28 +287,6 @@ object HoodieSparkSqlWriter { log.info(s"Registered avro schema : ${writerSchema.toString(true)}") // Convert to RDD[HoodieRecord] - val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, structName, nameSpace, reconcileSchema, - org.apache.hudi.common.util.Option.of(writerSchema)) - val shouldCombine = parameters(INSERT_DROP_DUPS.key()).toBoolean || - operation.equals(WriteOperationType.UPSERT) || - parameters.getOrElse(HoodieWriteConfig.COMBINE_BEFORE_INSERT.key(), - HoodieWriteConfig.COMBINE_BEFORE_INSERT.defaultValue()).toBoolean - val hoodieAllIncomingRecords = genericRecords.map(gr => { - val processedRecord = getProcessedRecord(partitionColumns, gr, dropPartitionColumns) - val hoodieRecord = if (shouldCombine) { - val orderingVal = HoodieAvroUtils.getNestedFieldVal(gr, hoodieConfig.getString(PRECOMBINE_FIELD), false, parameters.getOrElse( - DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), - DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()).toBoolean) - .asInstanceOf[Comparable[_]] - DataSourceUtils.createHoodieRecord(processedRecord, - orderingVal, - keyGenerator.getKey(gr), - hoodieConfig.getString(PAYLOAD_CLASS_NAME)) - } else { - DataSourceUtils.createHoodieRecord(processedRecord, keyGenerator.getKey(gr), hoodieConfig.getString(PAYLOAD_CLASS_NAME)) - } - hoodieRecord - }).toJavaRDD() val writerDataSchema = if (dropPartitionColumns) generateSchemaWithoutPartitionColumns(partitionColumns, writerSchema) else writerSchema // Create a HoodieWriteClient & issue the write. @@ -313,6 +294,12 @@ object HoodieSparkSqlWriter { val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, writerDataSchema.toString, path, tblName, mapAsJavaMap(addSchemaEvolutionParameters(parameters, internalSchemaOpt) - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key) )).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] + val writeConfig = client.getConfig + if (writeConfig.getRecordMerger.getRecordType == HoodieRecordType.SPARK && writeConfig.getLogDataBlockFormat.orElse(HoodieLogBlockType.AVRO_DATA_BLOCK) != HoodieLogBlockType.PARQUET_DATA_BLOCK) { + throw new UnsupportedOperationException(s"${writeConfig.getRecordMerger.getClass.getName} only support parquet log.") + } + // Create a HoodieWriteClient & issue the write. 
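[Illustrative note, not part of the patch] The guard added above enforces that a Spark-native record merger, which writes row-based data into log files, is only used together with parquet data blocks. A standalone restatement of that precondition; the config accessors and enum constants are the ones the patch references, only the wrapper class and method name are hypothetical.

import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
import org.apache.hudi.config.HoodieWriteConfig;

public class SparkMergerPreconditionSketch {
  // Rejects configurations that pair a SPARK-typed merger with the default Avro log block format.
  static void validateLogBlockFormat(HoodieWriteConfig writeConfig) {
    HoodieLogBlockType blockType =
        writeConfig.getLogDataBlockFormat().orElse(HoodieLogBlockType.AVRO_DATA_BLOCK);
    if (writeConfig.getRecordMerger().getRecordType() == HoodieRecordType.SPARK
        && blockType != HoodieLogBlockType.PARQUET_DATA_BLOCK) {
      throw new UnsupportedOperationException(
          writeConfig.getRecordMerger().getClass().getName() + " only supports parquet log blocks.");
    }
  }
}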
+ val hoodieAllIncomingRecords = createHoodieRecordRdd(df, writeConfig, parameters, writerSchema) if (isAsyncCompactionEnabled(client, tableConfig, parameters, jsc.hadoopConfiguration())) { asyncCompactionTriggerFn.get.apply(client) @@ -351,8 +338,15 @@ object HoodieSparkSqlWriter { HoodieAvroUtils.removeFields(schema, fieldsToRemove) } - def getProcessedRecord(partitionParam: String, record: GenericRecord, - dropPartitionColumns: Boolean): GenericRecord = { + def generateSparkSchemaWithoutPartitionColumns(partitionParam: String, schema: StructType): StructType = { + val fieldsToRemove = new java.util.ArrayList[String]() + partitionParam.split(",").map(partitionField => partitionField.trim) + .filter(s => s.nonEmpty).map(field => fieldsToRemove.add(field)) + HoodieInternalRowUtils.removeFields(schema, fieldsToRemove) + } + + def getAvroProcessedRecord(partitionParam: String, record: GenericRecord, + dropPartitionColumns: Boolean): GenericRecord = { var processedRecord = record if (dropPartitionColumns) { val writeSchema = generateSchemaWithoutPartitionColumns(partitionParam, record.getSchema) @@ -361,6 +355,17 @@ object HoodieSparkSqlWriter { processedRecord } + def getSparkProcessedRecord(partitionParam: String, record: InternalRow, + dropPartitionColumns: Boolean, schema: StructType): (InternalRow, StructType) = { + var processedRecord = record + var writeSchema = schema + if (dropPartitionColumns) { + writeSchema = generateSparkSchemaWithoutPartitionColumns(partitionParam, schema) + processedRecord = HoodieInternalRowUtils.rewriteRecord(record, schema, writeSchema) + } + (processedRecord, writeSchema) + } + def addSchemaEvolutionParameters(parameters: Map[String, String], internalSchemaOpt: Option[InternalSchema]): Map[String, String] = { val schemaEvolutionEnable = if (internalSchemaOpt.isDefined) "true" else "false" parameters ++ Map(HoodieWriteConfig.INTERNAL_SCHEMA_STRING.key() -> SerDeHelper.toJson(internalSchemaOpt.getOrElse(null)), @@ -490,7 +495,6 @@ object HoodieSparkSqlWriter { .setRecordKeyFields(recordKeyFields) .setArchiveLogFolder(archiveLogFolder) .setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_NAME)) - .setMergeClassName(hoodieConfig.getStringOrDefault(MERGE_CLASS_NAME)) .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD, null)) .setBootstrapIndexClass(bootstrapIndexClass) .setBaseFileFormat(baseFileFormat) @@ -829,4 +833,56 @@ object HoodieSparkSqlWriter { Map.empty } } + + private def createHoodieRecordRdd(df: DataFrame, config: HoodieWriteConfig, parameters: Map[String, String], schema: Schema): JavaRDD[HoodieRecord[_]] = { + val reconcileSchema = parameters(DataSourceWriteOptions.RECONCILE_SCHEMA.key()).toBoolean + val tblName = config.getString(HoodieWriteConfig.TBL_NAME) + val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tblName) + val shouldCombine = parameters(INSERT_DROP_DUPS.key()).toBoolean || + WriteOperationType.fromValue(config.getString(OPERATION)).equals(WriteOperationType.UPSERT) || + parameters.getOrElse(HoodieWriteConfig.COMBINE_BEFORE_INSERT.key(), + HoodieWriteConfig.COMBINE_BEFORE_INSERT.defaultValue()).toBoolean + val precombineField = config.getString(PRECOMBINE_FIELD) + val keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps)) + val partitionCols = SparkKeyGenUtils.getPartitionColumns(keyGenerator, toProperties(parameters)) + val dropPartitionColumns = config.getBoolean(DataSourceWriteOptions.DROP_PARTITION_COLUMNS) + 
config.getRecordMerger.getRecordType match { + case HoodieRecord.HoodieRecordType.AVRO => + val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, structName, nameSpace, reconcileSchema, + org.apache.hudi.common.util.Option.of(schema)) + genericRecords.map(gr => { + val processedRecord = getAvroProcessedRecord(partitionCols, gr, dropPartitionColumns) + val hoodieRecord = if (shouldCombine) { + val orderingVal = HoodieAvroUtils.getNestedFieldVal(gr, precombineField, false, parameters.getOrElse( + DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), + DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()).toBoolean) + .asInstanceOf[Comparable[_]] + DataSourceUtils.createHoodieRecord(processedRecord, + orderingVal, + keyGenerator.getKey(gr), + config.getString(PAYLOAD_CLASS_NAME)) + } else { + DataSourceUtils.createHoodieRecord(processedRecord, keyGenerator.getKey(gr), config.getString(PAYLOAD_CLASS_NAME)) + } + hoodieRecord + }).toJavaRDD() + case HoodieRecord.HoodieRecordType.SPARK => + // ut will use AvroKeyGenerator, so we need to cast it in spark record + val sparkKeyGenerator = keyGenerator.asInstanceOf[SparkKeyGeneratorInterface] + val structType = HoodieInternalRowUtils.getCachedSchema(schema) + val structTypeBC = SparkContext.getOrCreate().broadcast(structType) + HoodieInternalRowUtils.addCompressedSchema(structType) + df.queryExecution.toRdd.map(row => { + val internalRow = row.copy() + val (processedRow, writeSchema) = getSparkProcessedRecord(partitionCols, internalRow, dropPartitionColumns, structTypeBC.value) + val recordKey = sparkKeyGenerator.getRecordKey(internalRow, structTypeBC.value) + val partitionPath = sparkKeyGenerator.getPartitionPath(internalRow, structTypeBC.value) + val key = new HoodieKey(recordKey.toString, partitionPath.toString) + HoodieInternalRowUtils.addCompressedSchema(structTypeBC.value) + + new HoodieSparkRecord(key, processedRow, writeSchema) + }).toJavaRDD().asInstanceOf[JavaRDD[HoodieRecord[_]]] + } + } + } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala index aa29bad6b03d0..50276d74d9b86 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala @@ -18,7 +18,7 @@ package org.apache.hudi import org.apache.hudi.DataSourceOptionsHelper.allAlternatives -import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.DataSourceWriteOptions.{MERGER_IMPLS, _} import org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE import org.apache.hudi.common.config.{DFSPropertiesConfiguration, HoodieCommonConfig, HoodieConfig} import org.apache.hudi.common.table.HoodieTableConfig @@ -29,7 +29,6 @@ import org.apache.hudi.sync.common.HoodieSyncConfig import org.apache.hudi.util.SparkKeyGenUtils import org.apache.spark.sql.SparkSession import org.apache.spark.sql.hudi.command.SqlKeyGenerator - import java.util.Properties import scala.collection.JavaConversions.mapAsJavaMap import scala.collection.JavaConverters._ @@ -218,7 +217,8 @@ object HoodieWriterUtils { PRECOMBINE_FIELD -> HoodieTableConfig.PRECOMBINE_FIELD, PARTITIONPATH_FIELD -> HoodieTableConfig.PARTITION_FIELDS, RECORDKEY_FIELD -> HoodieTableConfig.RECORDKEY_FIELDS, - PAYLOAD_CLASS_NAME -> 
HoodieTableConfig.PAYLOAD_CLASS_NAME + PAYLOAD_CLASS_NAME -> HoodieTableConfig.PAYLOAD_CLASS_NAME, + MERGER_STRATEGY -> HoodieTableConfig.MERGER_STRATEGY ) def mappingSparkDatasourceConfigsToTableConfigs(options: Map[String, String]): Map[String, String] = { val includingTableConfigs = scala.collection.mutable.Map() ++ options diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala index 0fe62da0ded36..66495e0e242c2 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala @@ -333,7 +333,7 @@ object DataSkippingUtils extends Logging { } } -private object ColumnStatsExpressionUtils { +object ColumnStatsExpressionUtils { @inline def genColMinValueExpr(colName: String): Expression = col(getMinColumnNameFor(colName)).expr @inline def genColMaxValueExpr(colName: String): Expression = col(getMaxColumnNameFor(colName)).expr diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala index 732367cf5a5e5..8cd77900c0ebe 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql.hudi import org.apache.hudi.DataSourceWriteOptions import org.apache.hudi.avro.HoodieAvroUtils.getRootLevelFieldName +import org.apache.hudi.common.model.HoodieAvroRecordMerger import org.apache.hudi.common.table.HoodieTableConfig -import org.apache.hudi.common.util.ValidationUtils +import org.apache.hudi.common.util.{StringUtils, ValidationUtils} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.StructType @@ -69,6 +70,19 @@ object HoodieOptionConfig { .defaultValue(DataSourceWriteOptions.PAYLOAD_CLASS_NAME.defaultValue()) .build() + val SQL_MERGER_IMPLS: HoodieSQLOption[String] = buildConf() + .withSqlKey("mergerImpls") + .withHoodieKey(DataSourceWriteOptions.MERGER_IMPLS.key) + .defaultValue(classOf[HoodieAvroRecordMerger].getName) + .build() + + val SQL_MERGER_STRATEGY: HoodieSQLOption[String] = buildConf() + .withSqlKey("mergerStrategy") + .withHoodieKey(DataSourceWriteOptions.MERGER_STRATEGY.key) + .withTableConfigKey(HoodieTableConfig.MERGER_STRATEGY.key) + .defaultValue(StringUtils.DEFAULT_MERGER_STRATEGY_UUID) + .build() + /** * The mapping of the sql short name key to the hoodie's config key. 
*/ @@ -185,7 +199,7 @@ object HoodieOptionConfig { // extract primaryKey, preCombineField, type options def extractSqlOptions(options: Map[String, String]): Map[String, String] = { val sqlOptions = mappingTableConfigToSqlOption(options) - val targetOptions = keyMapping.keySet -- Set(SQL_PAYLOAD_CLASS.sqlKeyName) + val targetOptions = keyMapping.keySet -- Set(SQL_PAYLOAD_CLASS.sqlKeyName) -- Set(SQL_MERGER_IMPLS.sqlKeyName) -- Set(SQL_MERGER_STRATEGY.sqlKeyName) sqlOptions.filterKeys(targetOptions.contains) } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala index 798ed84b0939c..194f657acf459 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala @@ -27,9 +27,9 @@ import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory import org.apache.spark.sql.Row import org.apache.spark.sql.types.{StructType, TimestampType} import org.joda.time.format.DateTimeFormat - import java.sql.Timestamp import java.util.concurrent.TimeUnit.{MICROSECONDS, MILLISECONDS} +import org.apache.spark.sql.catalyst.InternalRow /** * A complex key generator for sql command which do some process for the diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java index bc732a1401a4f..cad5499b35e91 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java @@ -24,21 +24,30 @@ import org.apache.hudi.avro.model.HoodieFileStatus; import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.commmon.model.HoodieSparkRecord; import org.apache.hudi.common.bootstrap.FileStatusUtils; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.SparkKeyGeneratorInterface; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.broadcast.Broadcast; import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; import java.io.IOException; import java.util.List; @@ -55,32 +64,50 @@ public SparkFullBootstrapDataProviderBase(TypedProperties props, @Override public JavaRDD generateInputRecords(String 
tableName, String sourceBasePath, - List>> partitionPathsWithFiles) { + List>> partitionPathsWithFiles, HoodieWriteConfig config) { String[] filePaths = partitionPathsWithFiles.stream().map(Pair::getValue) .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString())) .toArray(String[]::new); // NOTE: "basePath" option is required for spark to discover the partition column // More details at https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery + HoodieRecordType recordType = config.getRecordMerger().getRecordType(); Dataset inputDataset = sparkSession.read().format(getFormat()).option("basePath", sourceBasePath).load(filePaths); try { KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props); + String precombineKey = props.getString("hoodie.datasource.write.precombine.field"); String structName = tableName + "_record"; String namespace = "hoodie." + tableName; - RDD genericRecords = HoodieSparkUtils.createRdd(inputDataset, structName, namespace, false, - Option.empty()); - return genericRecords.toJavaRDD().map(gr -> { - String orderingVal = HoodieAvroUtils.getNestedFieldValAsString( - gr, props.getString("hoodie.datasource.write.precombine.field"), false, props.getBoolean( - KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), - Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()))); - try { - return DataSourceUtils.createHoodieRecord(gr, orderingVal, keyGenerator.getKey(gr), - props.getString("hoodie.datasource.write.payload.class")); - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } - }); + if (recordType == HoodieRecordType.AVRO) { + RDD genericRecords = HoodieSparkUtils.createRdd(inputDataset, structName, namespace, false, + Option.empty()); + return genericRecords.toJavaRDD().map(gr -> { + String orderingVal = HoodieAvroUtils.getNestedFieldValAsString( + gr, precombineKey, false, props.getBoolean( + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), + Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()))); + try { + return DataSourceUtils.createHoodieRecord(gr, orderingVal, keyGenerator.getKey(gr), + props.getString("hoodie.datasource.write.payload.class")); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + }); + } else if (recordType == HoodieRecordType.SPARK) { + SparkKeyGeneratorInterface sparkKeyGenerator = (SparkKeyGeneratorInterface) keyGenerator; + StructType structType = inputDataset.schema(); + Broadcast structTypeBC = new JavaSparkContext(sparkSession.sparkContext()).broadcast(structType); + return inputDataset.queryExecution().toRdd().toJavaRDD().map(row -> { + InternalRow internalRow = row.copy(); + String recordKey = sparkKeyGenerator.getRecordKey(internalRow, structTypeBC.value()).toString(); + String partitionPath = sparkKeyGenerator.getPartitionPath(internalRow, structTypeBC.value()).toString(); + HoodieKey key = new HoodieKey(recordKey, partitionPath); + return new HoodieSparkRecord(key, internalRow, structTypeBC.value()); + }); + } else { + throw new UnsupportedOperationException(recordType.name()); + } + } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkRecord.java 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkRecord.java deleted file mode 100644 index a22e78af21e3c..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkRecord.java +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hudi; - -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieOperation; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.keygen.BaseKeyGenerator; - -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; - -import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.Properties; - -import static org.apache.spark.sql.types.DataTypes.BooleanType; - -/** - * Spark Engine-specific Implementations of `HoodieRecord`. 
- */ -public class HoodieSparkRecord extends HoodieRecord { - - public HoodieSparkRecord(HoodieKey key, InternalRow data, Comparable orderingVal) { - super(key, data, orderingVal); - } - - public HoodieSparkRecord(HoodieKey key, InternalRow data, HoodieOperation operation, Comparable orderingVal) { - super(key, data, operation, orderingVal); - } - - public HoodieSparkRecord(HoodieRecord record) { - super(record); - } - - public HoodieSparkRecord() { - } - - @Override - public HoodieRecord newInstance() { - return new HoodieSparkRecord(this); - } - - @Override - public HoodieRecord newInstance(HoodieKey key, HoodieOperation op) { - return new HoodieSparkRecord(key, data, op, getOrderingValue()); - } - - @Override - public HoodieRecord newInstance(HoodieKey key) { - return new HoodieSparkRecord(key, data, getOrderingValue()); - } - - @Override - public String getRecordKey(Option keyGeneratorOpt) { - return getRecordKey(); - } - - @Override - public String getRecordKey(String keyFieldName) { - return getRecordKey(); - } - - @Override - public HoodieRecord mergeWith(HoodieRecord other, Schema readerSchema, Schema writerSchema) throws IOException { - StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(readerSchema); - StructType writerStructType = HoodieInternalRowUtils.getCachedSchema(writerSchema); - InternalRow mergeRow = HoodieInternalRowUtils.stitchRecords(data, readerStructType, (InternalRow) other.getData(), readerStructType, writerStructType); - return new HoodieSparkRecord(getKey(), mergeRow, getOperation()); - } - - @Override - public HoodieRecord rewriteRecord(Schema recordSchema, Schema targetSchema, TypedProperties props) throws IOException { - StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(recordSchema); - StructType targetStructType = HoodieInternalRowUtils.getCachedSchema(targetSchema); - InternalRow rewriteRow = HoodieInternalRowUtils.rewriteRecord(data, readerStructType, targetStructType); - return new HoodieSparkRecord(getKey(), rewriteRow, getOperation()); - } - - @Override - public HoodieRecord rewriteRecord(Schema recordSchema, Properties prop, boolean schemaOnReadEnabled, Schema writeSchemaWithMetaFields) throws IOException { - StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(recordSchema); - StructType writeSchemaWithMetaFieldsStructType = HoodieInternalRowUtils.getCachedSchema(writeSchemaWithMetaFields); - InternalRow rewriteRow = schemaOnReadEnabled ? HoodieInternalRowUtils.rewriteRecordWithNewSchema(data, readerStructType, writeSchemaWithMetaFieldsStructType, new HashMap<>()) - : HoodieInternalRowUtils.rewriteRecord(data, readerStructType, writeSchemaWithMetaFieldsStructType); - return new HoodieSparkRecord(getKey(), rewriteRow, getOperation()); - } - - @Override - public HoodieRecord rewriteRecordWithMetadata(Schema recordSchema, Properties prop, boolean schemaOnReadEnabled, Schema writeSchemaWithMetaFields, String fileName) throws IOException { - StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(recordSchema); - StructType writeSchemaWithMetaFieldsStructType = HoodieInternalRowUtils.getCachedSchema(writeSchemaWithMetaFields); - InternalRow rewriteRow = schemaOnReadEnabled ? 
HoodieInternalRowUtils.rewriteEvolutionRecordWithMetadata(data, readerStructType, writeSchemaWithMetaFieldsStructType, fileName) - : HoodieInternalRowUtils.rewriteRecordWithMetadata(data, readerStructType, writeSchemaWithMetaFieldsStructType, fileName); - return new HoodieSparkRecord(getKey(), rewriteRow, getOperation()); - } - - @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema, Map renameCols) throws IOException { - StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(recordSchema); - StructType newStructType = HoodieInternalRowUtils.getCachedSchema(newSchema); - InternalRow rewriteRow = HoodieInternalRowUtils.rewriteRecordWithNewSchema(data, readerStructType, newStructType, renameCols); - return new HoodieSparkRecord(getKey(), rewriteRow, getOperation()); - } - - @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema, Map renameCols, Mapper mapper) throws IOException { - StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(recordSchema); - StructType newStructType = HoodieInternalRowUtils.getCachedSchema(newSchema); - InternalRow rewriteRow = HoodieInternalRowUtils.rewriteRecordWithNewSchema(data, readerStructType, newStructType, renameCols); - // TODO change mapper type - return mapper.apply((IndexedRecord) rewriteRow); - } - - @Override - public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties prop, Schema newSchema) throws IOException { - StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(recordSchema); - StructType newStructType = HoodieInternalRowUtils.getCachedSchema(newSchema); - InternalRow rewriteRow = HoodieInternalRowUtils.rewriteRecord(data, readerStructType, newStructType); - return new HoodieSparkRecord(getKey(), rewriteRow, getOperation()); - } - - @Override - public HoodieRecord overrideMetadataFieldValue(Schema recordSchema, Properties prop, int pos, String newValue) throws IOException { - data.update(pos, newValue); - return this; - } - - @Override - public HoodieRecord addMetadataValues(Schema recordSchema, Properties prop, Map metadataValues) throws IOException { - Arrays.stream(HoodieMetadataField.values()).forEach(metadataField -> { - String value = metadataValues.get(metadataField); - if (value != null) { - data.update(recordSchema.getField(metadataField.getFieldName()).pos(), value); - } - }); - return this; - } - - @Override - public Option> getMetadata() { - return Option.empty(); - } - - @Override - public boolean isPresent(Schema schema, Properties prop) throws IOException { - if (null == data) { - return false; - } - Object deleteMarker = data.get(schema.getField(HoodieRecord.HOODIE_IS_DELETED_FIELD).pos(), BooleanType); - return !(deleteMarker instanceof Boolean && (boolean) deleteMarker); - } - - @Override - public boolean shouldIgnore(Schema schema, Properties prop) throws IOException { - // TODO SENTINEL should refactor without Avro(GenericRecord) - if (null != data && data.equals(SENTINEL)) { - return true; - } else { - return false; - } - } - - @Override - public Option toIndexedRecord(Schema schema, Properties prop) throws IOException { - throw new UnsupportedOperationException(); - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala index 2421face72ad9..b5be1c7d7d40e 100644 --- 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala @@ -18,15 +18,15 @@ package org.apache.spark.sql.hudi import org.apache.avro.Schema -import org.apache.avro.generic.IndexedRecord import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hudi.avro.HoodieAvroWriteSupport import org.apache.hudi.client.SparkTaskContextSupplier import org.apache.hudi.common.bloom.{BloomFilter, BloomFilterFactory} +import org.apache.hudi.common.config.HoodieStorageConfig import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord} import org.apache.hudi.common.util.BaseFileUtils -import org.apache.hudi.config.{HoodieIndexConfig, HoodieStorageConfig} +import org.apache.hudi.config.HoodieIndexConfig import org.apache.hudi.io.storage.{HoodieAvroParquetWriter, HoodieParquetConfig} import org.apache.parquet.avro.AvroSchemaConverter import org.apache.parquet.hadoop.metadata.CompressionCodecName diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ExportInstantsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ExportInstantsProcedure.scala index ad21c11e9b056..55a282e773086 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ExportInstantsProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ExportInstantsProcedure.scala @@ -17,14 +17,14 @@ package org.apache.spark.sql.hudi.command.procedures -import org.apache.avro.generic.{GenericRecord, IndexedRecord} +import org.apache.avro.generic.GenericRecord import org.apache.avro.specific.SpecificData import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hudi.HoodieCLIUtils import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.avro.model.HoodieArchivedMetaEntry import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.model.{HoodieAvroIndexedRecord, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.model.HoodieLogFile import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.log.HoodieLogFormat import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock @@ -35,11 +35,11 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} - import java.io.File import java.util import java.util.Collections import java.util.function.Supplier +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import scala.collection.JavaConverters._ import scala.util.control.Breaks.break @@ -124,10 +124,7 @@ class ExportInstantsProcedure extends BaseProcedure with ProcedureBuilder with L }) { val blk = reader.next.asInstanceOf[HoodieAvroDataBlock] try { - val mapper = new HoodieRecord.Mapper() { - override def apply(data: IndexedRecord) = new HoodieAvroIndexedRecord(data) - } - val recordItr = blk.getRecordIterator(mapper) + val recordItr = blk.getRecordIterator(HoodieRecordType.AVRO) try while ( { recordItr.hasNext }) { diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileMetadataProcedure.scala 
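// --- illustrative sketch (not part of the patch) ---
// Hedged sketch of the iterator API these CLI/procedure changes migrate to:
// instead of passing a HoodieRecord.Mapper that wraps each IndexedRecord, callers
// now request an iterator for a HoodieRecordType and unwrap the payload via
// getData. `dataBlock` (e.g. a HoodieAvroDataBlock read from an archived log
// file) is assumed to be in scope; sketch only.
import org.apache.avro.generic.IndexedRecord
import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType
import org.apache.hudi.common.table.log.block.HoodieDataBlock

import scala.collection.mutable.ListBuffer

def readAvroRecords(dataBlock: HoodieDataBlock): List[IndexedRecord] = {
  val recordItr = dataBlock.getRecordIterator(HoodieRecordType.AVRO)
  try {
    val records = ListBuffer[IndexedRecord]()
    while (recordItr.hasNext) {
      records += recordItr.next().getData.asInstanceOf[IndexedRecord]
    }
    records.toList
  } finally {
    recordItr.close() // ClosableIterator must be closed to release the block's resources
  }
}
// --- end sketch ---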
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileMetadataProcedure.scala index 415c642d95d5e..3553caef515fc 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileMetadataProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileMetadataProcedure.scala @@ -19,9 +19,8 @@ package org.apache.spark.sql.hudi.command.procedures import com.fasterxml.jackson.databind.ObjectMapper import org.apache.hadoop.fs.Path -import org.apache.avro.generic.IndexedRecord import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.model.{HoodieAvroIndexedRecord, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.model.HoodieLogFile import org.apache.hudi.common.table.log.HoodieLogFormat import org.apache.hudi.common.table.log.block.HoodieLogBlock.{HeaderMetadataType, HoodieLogBlockType} import org.apache.hudi.common.table.log.block.{HoodieCorruptBlock, HoodieDataBlock} @@ -29,10 +28,10 @@ import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.parquet.avro.AvroSchemaConverter import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} - import java.util.Objects import java.util.concurrent.atomic.AtomicInteger import java.util.function.Supplier +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import scala.collection.JavaConverters.{asScalaBufferConverter, asScalaIteratorConverter, mapAsScalaMapConverter} class ShowHoodieLogFileMetadataProcedure extends BaseProcedure with ProcedureBuilder { @@ -94,10 +93,7 @@ class ShowHoodieLogFileMetadataProcedure extends BaseProcedure with ProcedureBui } block match { case dataBlock: HoodieDataBlock => - val mapper = new HoodieRecord.Mapper() { - override def apply(data: IndexedRecord) = new HoodieAvroIndexedRecord(data) - } - val recordItr = dataBlock.getRecordIterator(mapper) + val recordItr = dataBlock.getRecordIterator(HoodieRecordType.AVRO) recordItr.asScala.foreach(_ => recordCount.incrementAndGet()) recordItr.close() } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileRecordsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileRecordsProcedure.scala index ecee96bc46016..9c3e2e469b0fd 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileRecordsProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileRecordsProcedure.scala @@ -21,18 +21,18 @@ import org.apache.avro.generic.IndexedRecord import org.apache.hadoop.fs.Path import org.apache.hudi.common.config.HoodieCommonConfig import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.model.{HoodieAvroIndexedRecord, HoodieLogFile, HoodieRecord, HoodieRecordPayload} +import org.apache.hudi.common.model.{HoodieAvroRecordMerger, HoodieLogFile, HoodieRecordPayload} import org.apache.hudi.common.table.log.block.HoodieDataBlock import org.apache.hudi.common.table.log.{HoodieLogFormat, HoodieMergedLogRecordScanner} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.util.ValidationUtils +import 
org.apache.hudi.common.util.{HoodieRecordUtils, ValidationUtils} import org.apache.hudi.config.{HoodieCompactionConfig, HoodieMemoryConfig} import org.apache.parquet.avro.AvroSchemaConverter import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} - import java.util.Objects import java.util.function.Supplier +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import scala.collection.JavaConverters._ class ShowHoodieLogFileRecordsProcedure extends BaseProcedure with ProcedureBuilder { @@ -76,6 +76,7 @@ class ShowHoodieLogFileRecordsProcedure extends BaseProcedure with ProcedureBuil .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue) .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue) .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue) + .withRecordMerger(HoodieRecordUtils.loadRecordMerger(classOf[HoodieAvroRecordMerger].getName)) .build scanner.asScala.foreach(hoodieRecord => { val record = hoodieRecord.getData.asInstanceOf[HoodieRecordPayload[_]].getInsertValue(schema).get() @@ -92,10 +93,7 @@ class ShowHoodieLogFileRecordsProcedure extends BaseProcedure with ProcedureBuil val block = reader.next() block match { case dataBlock: HoodieDataBlock => - val mapper = new HoodieRecord.Mapper() { - override def apply(data: IndexedRecord) = new HoodieAvroIndexedRecord(data) - } - val recordItr = dataBlock.getRecordIterator(mapper) + val recordItr = dataBlock.getRecordIterator(HoodieRecordType.AVRO) recordItr.asScala.foreach(record => { if (allRecords.size() < limit) { allRecords.add(record.getData.asInstanceOf[IndexedRecord]) diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 11f0fc97856e9..26786e08bbc6c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -27,7 +28,6 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.config.HoodieClusteringConfig; -import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java index 93b25f8a6542f..8dc45ca908426 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java @@ -489,7 +489,7 @@ public TestFullBootstrapDataProvider(TypedProperties props, HoodieSparkEngineCon @Override public JavaRDD generateInputRecords(String tableName, String sourceBasePath, - List>> partitionPaths) { + List>> 
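// --- illustrative sketch (not part of the patch) ---
// Hedged sketch of the scanner change in ShowHoodieLogFileRecordsProcedure above:
// HoodieMergedLogRecordScanner now takes an explicit record merger, and the
// procedure keeps Avro merge semantics by loading HoodieAvroRecordMerger through
// HoodieRecordUtils. Only the new call is shown live; the remaining builder
// options (base path, log file paths, reader schema, spillable map settings) are
// exactly those already present in the diff.
import org.apache.hudi.common.model.HoodieAvroRecordMerger
import org.apache.hudi.common.util.HoodieRecordUtils

// load the Avro merger by class name, exactly as the procedure now does
val merger = HoodieRecordUtils.loadRecordMerger(classOf[HoodieAvroRecordMerger].getName)

// the merger is then handed to the merged log record scanner builder, e.g.
//   HoodieMergedLogRecordScanner.newBuilder()
//     ... // options as in the diff above
//     .withRecordMerger(merger)
//     .build()
// --- end sketch ---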
partitionPaths, HoodieWriteConfig config) { String filePath = FileStatusUtils.toPath(partitionPaths.stream().flatMap(p -> p.getValue().stream()) .findAny().get().getPath()).toString(); ParquetFileReader reader = null; diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java index 330b6015bc625..24c681bf04f34 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java @@ -379,7 +379,7 @@ public TestFullBootstrapDataProvider(TypedProperties props, HoodieSparkEngineCon @Override public JavaRDD generateInputRecords(String tableName, String sourceBasePath, - List>> partitionPaths) { + List>> partitionPaths, HoodieWriteConfig config) { String[] filePaths = partitionPaths.stream().map(Pair::getValue) .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString())) .toArray(String[]::new); diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala index ddf7aa351c7a7..1bb81f7f92eea 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala @@ -209,6 +209,7 @@ class TestDataSourceDefaults extends ScalaAssertionSupport { val STRUCT_NAME: String = "hoodieRowTopLevelField" val NAMESPACE: String = "hoodieRow" var converterFn: Function1[Row, GenericRecord] = _ + var internalConverterFn: Function1[InternalRow, GenericRecord] = _ override def getKey(record: GenericRecord): HoodieKey = { new HoodieKey(HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyProp, true, false), diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala index 19027a47bfabc..847095567f58e 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala @@ -23,7 +23,7 @@ import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieFileIndex.DataSkippingFailureMode import org.apache.hudi.client.HoodieJavaWriteClient import org.apache.hudi.client.common.HoodieJavaEngineContext -import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig} import org.apache.hudi.common.engine.EngineType import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType} import org.apache.hudi.common.table.view.HoodieTableFileSystemView @@ -33,11 +33,12 @@ import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestUtils} import org.apache.hudi.common.util.PartitionPathEncodeUtils import org.apache.hudi.common.util.StringUtils.isNullOrEmpty -import org.apache.hudi.config.{HoodieStorageConfig, HoodieWriteConfig} +import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.keygen.ComplexKeyGenerator import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.TimestampType import 
org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config import org.apache.hudi.testutils.HoodieClientTestBase + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, EqualTo, GreaterThanOrEqual, LessThan, Literal} import org.apache.spark.sql.execution.datasources.{NoopCache, PartitionDirectory} @@ -48,8 +49,8 @@ import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.{BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{Arguments, CsvSource, MethodSource, ValueSource} - import java.util.Properties + import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.util.Random diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieInternalRowUtils.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieInternalRowUtils.scala index 7a08ee64bf6ef..624033b67f288 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieInternalRowUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieInternalRowUtils.scala @@ -20,8 +20,8 @@ package org.apache.hudi import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.testutils.HoodieClientTestUtils + import org.apache.spark.api.java.JavaSparkContext -import org.apache.spark.sql.hudi.HoodieInternalRowUtils import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SparkSession} import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestStructTypeSchemaEvolutionUtils.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestStructTypeSchemaEvolutionUtils.scala index cb5529721cf67..bd2bda59ee5c3 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestStructTypeSchemaEvolutionUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestStructTypeSchemaEvolutionUtils.scala @@ -20,6 +20,7 @@ package org.apache.hudi import java.nio.ByteBuffer import java.util.{ArrayList, HashMap, Objects} + import org.apache.avro.generic.GenericData import org.apache.avro.{LogicalTypes, Schema} import org.apache.hudi.avro.HoodieAvroUtils @@ -28,11 +29,11 @@ import org.apache.hudi.internal.schema.action.TableChanges import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter import org.apache.hudi.internal.schema.utils.SchemaChangeUtils import org.apache.hudi.testutils.HoodieClientTestUtils + import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} -import org.apache.spark.sql.hudi.HoodieInternalRowUtils import org.apache.spark.sql.types._ import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala index 4de1a5943ba16..3bc7c6870fdc9 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala @@ -20,7 +20,7 @@ package org.apache.hudi.functional import 
org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hudi.HoodieConversionUtils.toJavaOption import org.apache.hudi.QuickstartUtils.{convertToStringList, getQuickstartWriteConfigs} -import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig} import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.table.timeline.HoodieInstant import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} @@ -36,7 +36,7 @@ import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config import org.apache.hudi.metrics.Metrics import org.apache.hudi.testutils.HoodieClientTestBase import org.apache.hudi.util.JFunction -import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, QuickstartUtils} +import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, QuickstartUtils, HoodieSparkRecordMerger} import org.apache.spark.sql._ import org.apache.spark.sql.functions.{col, concat, lit, udf} import org.apache.spark.sql.hudi.HoodieSparkSessionExtension @@ -47,10 +47,10 @@ import org.junit.jupiter.api.Assertions.{assertEquals, assertThrows, assertTrue, import org.junit.jupiter.api.function.Executable import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest -import org.junit.jupiter.params.provider.{CsvSource, ValueSource} - +import org.junit.jupiter.params.provider.{CsvSource, EnumSource} import java.sql.{Date, Timestamp} import java.util.function.Consumer +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ @@ -72,6 +72,10 @@ class TestCOWDataSource extends HoodieClientTestBase { HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", HoodieMetadataConfig.COMPACT_NUM_DELTA_COMMITS.key -> "1" ) + val sparkOpts = Map( + HoodieWriteConfig.MERGER_IMPLS.key -> classOf[HoodieSparkRecordMerger].getName, + HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT.key -> "parquet" + ) val verificationCol: String = "driver" val updatedVerificationVal: String = "driver_update" @@ -98,12 +102,16 @@ class TestCOWDataSource extends HoodieClientTestBase { System.gc() } - @Test def testShortNameStorage() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testShortNameStorage(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + // Insert Operation val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) inputDF.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Overwrite) .save(basePath) @@ -111,7 +119,11 @@ class TestCOWDataSource extends HoodieClientTestBase { assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000")) } - @Test def testNoPrecombine() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testNoPrecombine(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + // Insert Operation val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) @@ -122,17 
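// --- illustrative sketch (not part of the patch) ---
// Hedged sketch of how the new `sparkOpts` map above switches a write (and the
// subsequent reads) over to the Spark-native record merger. The option keys and
// values mirror the map introduced in TestCOWDataSource; `commonOpts`, `inputDF`
// and `basePath` are assumed to exist as in the test.
import org.apache.hudi.HoodieSparkRecordMerger
import org.apache.hudi.common.config.HoodieStorageConfig
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.spark.sql.SaveMode

val sparkOpts = Map(
  HoodieWriteConfig.MERGER_IMPLS.key -> classOf[HoodieSparkRecordMerger].getName,
  HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT.key -> "parquet")

inputDF.write.format("hudi")
  .options(commonOpts ++ sparkOpts)
  .mode(SaveMode.Overwrite)
  .save(basePath)

// The tests obtain their options through a getOpts(recordType) helper whose body
// is cut off at the end of this hunk. Inferred from how (writeOpts, readOpts) are
// used, one plausible, purely illustrative shape would be:
//   def getOpts(recordType: HoodieRecordType): (Map[String, String], Map[String, String]) =
//     recordType match {
//       case HoodieRecordType.SPARK => (commonOpts ++ sparkOpts, sparkOpts)
//       case _                      => (commonOpts, Map.empty)
//     }
// --- end sketch ---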
+134,21 @@ class TestCOWDataSource extends HoodieClientTestBase { DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" - ) + ) ++ writeOpts inputDF.write.format("hudi") .options(commonOptsNoPreCombine) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Overwrite) .save(basePath) - spark.read.format("org.apache.hudi").load(basePath).count() + spark.read.format("org.apache.hudi").options(readOpts).load(basePath).count() } - @Test def testHoodieIsDeletedNonBooleanField() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testHoodieIsDeletedNonBooleanField(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + // Insert Operation val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) @@ -141,7 +157,7 @@ class TestCOWDataSource extends HoodieClientTestBase { assertThrows(classOf[HoodieException], new Executable { override def execute(): Unit = { df.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Overwrite) .save(basePath) } @@ -155,8 +171,11 @@ class TestCOWDataSource extends HoodieClientTestBase { * * For COW table, test the snapshot query mode and incremental query mode. */ - @Test - def testPrunePartitionForTimestampBasedKeyGenerator(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testPrunePartitionForTimestampBasedKeyGenerator(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val options = commonOpts ++ Map( "hoodie.compact.inline" -> "false", DataSourceWriteOptions.TABLE_TYPE.key -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, @@ -165,7 +184,7 @@ class TestCOWDataSource extends HoodieClientTestBase { Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP -> "yyyy/MM/dd", Config.TIMESTAMP_TIMEZONE_FORMAT_PROP -> "GMT+8:00", Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP -> "yyyy-MM-dd" - ) + ) ++ writeOpts val dataGen1 = new HoodieTestDataGenerator(Array("2022-01-01")) val records1 = recordsToStrings(dataGen1.generateInserts("001", 20)).toList @@ -190,7 +209,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val commit2Time = metaClient.reloadActiveTimeline.lastInstant().get().getTimestamp // snapshot query - val snapshotQueryRes = spark.read.format("hudi").load(basePath) + val snapshotQueryRes = spark.read.format("hudi").options(readOpts).load(basePath) // TODO(HUDI-3204) we have to revert this to pre-existing behavior from 0.10 //assertEquals(snapshotQueryRes.where("partition = '2022-01-01'").count, 20) //assertEquals(snapshotQueryRes.where("partition = '2022-01-02'").count, 30) @@ -199,6 +218,7 @@ class TestCOWDataSource extends HoodieClientTestBase { // incremental query val incrementalQueryRes = spark.read.format("hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit1Time) .option(DataSourceReadOptions.END_INSTANTTIME.key, commit2Time) @@ -215,15 +235,18 @@ class TestCOWDataSource extends HoodieClientTestBase { * archival should kick in and 2 commits should be archived. If schema is valid, no exception will be thrown. If not, * NPE will be thrown. 
*/ - @Test - def testArchivalWithBulkInsert(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testArchivalWithBulkInsert(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + var structType : StructType = null for (i <- 1 to 4) { val records = recordsToStrings(dataGen.generateInserts("%05d".format(i), 100)).toList val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) structType = inputDF.schema inputDF.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option("hoodie.keep.min.commits", "1") .option("hoodie.keep.max.commits", "2") .option("hoodie.cleaner.commits.retained", "0") @@ -244,19 +267,23 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals(schema, actualSchema) } - @Test - def testCopyOnWriteDeletes(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testCopyOnWriteDeletes(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + // Insert Operation val records1 = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Overwrite) .save(basePath) assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000")) val snapshotDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*/*/*") assertEquals(100, snapshotDF1.count()) @@ -264,21 +291,26 @@ class TestCOWDataSource extends HoodieClientTestBase { val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2 , 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) val snapshotDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*/*/*") assertEquals(snapshotDF2.count(), 80) } - @Test def testOverWriteModeUseReplaceAction(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testOverWriteModeUseReplaceAction(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val records1 = recordsToStrings(dataGen.generateInserts("001", 5)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) @@ -286,7 +318,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val records2 = recordsToStrings(dataGen.generateInserts("002", 5)).toList val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) @@ -300,12 +332,15 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals("replacecommit", commits(1)) } - @Test - def testReadPathsOnCopyOnWriteTable(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def 
testReadPathsOnCopyOnWriteTable(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val records1 = dataGen.generateInsertsContainsAllPartitions("001", 20) val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) @@ -319,13 +354,14 @@ class TestCOWDataSource extends HoodieClientTestBase { val records2 = dataGen.generateInsertsContainsAllPartitions("002", 20) val inputDF2 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records2), 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) // Use bulk insert here to make sure the files have different file groups. .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) val hudiReadPathDF = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.READ_PATHS.key, record1FilePaths) .load() @@ -333,11 +369,15 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals(expectedCount, hudiReadPathDF.count()) } - @Test def testOverWriteTableModeUseReplaceAction(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testOverWriteTableModeUseReplaceAction(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val records1 = recordsToStrings(dataGen.generateInserts("001", 5)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) @@ -345,7 +385,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val records2 = recordsToStrings(dataGen.generateInserts("002", 5)).toList val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL) .mode(SaveMode.Overwrite) .save(basePath) @@ -359,12 +399,16 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals("replacecommit", commits(1)) } - @Test def testOverWriteModeUseReplaceActionOnDisJointPartitions(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testOverWriteModeUseReplaceActionOnDisJointPartitions(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + // step1: Write 5 records to hoodie table for partition1 DEFAULT_FIRST_PARTITION_PATH val records1 = recordsToStrings(dataGen.generateInsertsForPartition("001", 5, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) @@ -373,7 +417,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val records2 = 
recordsToStrings(dataGen.generateInsertsForPartition("002", 7, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)).toList val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) @@ -382,12 +426,12 @@ class TestCOWDataSource extends HoodieClientTestBase { val records3 = recordsToStrings(dataGen.generateInsertsForPartition("001", 6, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList val inputDF3 = spark.read.json(spark.sparkContext.parallelize(records3, 2)) inputDF3.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) - val allRecords = spark.read.format("org.apache.hudi").load(basePath + "/*/*/*") + val allRecords = spark.read.format("org.apache.hudi").options(readOpts).load(basePath + "/*/*/*") allRecords.registerTempTable("tmpTable") spark.sql(String.format("select count(*) from tmpTable")).show() @@ -415,12 +459,16 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals("replacecommit", commits(2)) } - @Test def testOverWriteTableModeUseReplaceActionOnDisJointPartitions(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testOverWriteTableModeUseReplaceActionOnDisJointPartitions(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + // step1: Write 5 records to hoodie table for partition1 DEFAULT_FIRST_PARTITION_PATH val records1 = recordsToStrings(dataGen.generateInsertsForPartition("001", 5, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) @@ -429,12 +477,12 @@ class TestCOWDataSource extends HoodieClientTestBase { val records2 = recordsToStrings(dataGen.generateInsertsForPartition("002", 7, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)).toList val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL) .mode(SaveMode.Overwrite) .save(basePath) - val allRecords = spark.read.format("org.apache.hudi").load(basePath + "/*/*/*") + val allRecords = spark.read.format("org.apache.hudi").options(readOpts).load(basePath + "/*/*/*") allRecords.registerTempTable("tmpTable") spark.sql(String.format("select count(*) from tmpTable")).show() @@ -465,7 +513,11 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals("replacecommit", commits(1)) } - @Test def testDropInsertDup(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testDropInsertDup(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val insert1Cnt = 10 val insert2DupKeyCnt = 9 val insert2NewKeyCnt = 2 @@ -479,11 +531,12 @@ class TestCOWDataSource extends 
HoodieClientTestBase { val records1 = recordsToStrings(inserts1).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .mode(SaveMode.Overwrite) .save(basePath) val hoodieROViewDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*/*/*") assertEquals(insert1Cnt, hoodieROViewDF1.count()) @@ -491,22 +544,28 @@ class TestCOWDataSource extends HoodieClientTestBase { val records2 = recordsToStrings(inserts2Dup ++ inserts2New).toList val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "true") .mode(SaveMode.Append) .save(basePath) val hoodieROViewDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*/*/*") assertEquals(hoodieROViewDF2.count(), totalUniqueKeyToGenerate) val hoodieIncViewDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime1) .load(basePath) assertEquals(hoodieIncViewDF2.count(), insert2NewKeyCnt) } - @Test def testComplexDataTypeWriteAndReadConsistency(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testComplexDataTypeWriteAndReadConsistency(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val schema = StructType(StructField("_row_key", StringType, true) :: StructField("name", StringType, true) :: StructField("timeStampValue", TimestampType, true) :: StructField("dateValue", DateType, true) :: StructField("decimalValue", DataTypes.createDecimalType(15, 10), true) :: StructField("timestamp", IntegerType, true) @@ -518,11 +577,12 @@ class TestCOWDataSource extends HoodieClientTestBase { val rdd = jsc.parallelize(records) val recordsDF = spark.createDataFrame(rdd, schema) recordsDF.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Overwrite) .save(basePath) val recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*") recordsReadDF.printSchema() recordsReadDF.schema.foreach(f => { @@ -538,11 +598,15 @@ class TestCOWDataSource extends HoodieClientTestBase { }) } - @Test def testWithAutoCommitOn(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testWithAutoCommitOn(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val records1 = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(HoodieWriteConfig.AUTO_COMMIT_ENABLE.key, "true") .mode(SaveMode.Overwrite) @@ -551,51 +615,58 @@ class TestCOWDataSource extends HoodieClientTestBase { assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000")) } - private def getDataFrameWriter(keyGenerator: String): DataFrameWriter[Row] = { + private def 
getDataFrameWriter(keyGenerator: String, opts: Map[String, String]): DataFrameWriter[Row] = { val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) inputDF.write.format("hudi") - .options(commonOpts) + .options(opts) .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, keyGenerator) .mode(SaveMode.Overwrite) } - @Test def testSparkPartitionByWithCustomKeyGenerator(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testSparkPartitionByWithCustomKeyGenerator(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + // Without fieldType, the default is SIMPLE - var writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) + var writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, writeOpts) writer.partitionBy("current_ts") .mode(SaveMode.Overwrite) .save(basePath) var recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*") assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("current_ts").cast("string")).count() == 0) // Specify fieldType as TIMESTAMP - writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) + writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, writeOpts) writer.partitionBy("current_ts:TIMESTAMP") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") .mode(SaveMode.Overwrite) .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*") val udf_date_format = udf((data: Long) => new DateTime(data).toString(DateTimeFormat.forPattern("yyyyMMdd"))) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= udf_date_format(col("current_ts"))).count() == 0) // Mixed fieldType - writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) + writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, writeOpts) writer.partitionBy("driver", "rider:SIMPLE", "current_ts:TIMESTAMP") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") .mode(SaveMode.Overwrite) .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*/*") assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= concat(col("driver"), lit("/"), col("rider"), lit("/"), udf_date_format(col("current_ts")))).count() == 0) // Test invalid partitionKeyType - writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) + writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, writeOpts) writer = writer.partitionBy("current_ts:DUMMY") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") @@ -608,47 +679,62 @@ class TestCOWDataSource extends HoodieClientTestBase { } } - @Test def testSparkPartitionByWithSimpleKeyGenerator() { + @Test + def testSparkPartitionByWithSimpleKeyGenerator() { + val (writeOpts, readOpts) = getOpts(HoodieRecordType.AVRO) + // Use the `driver` field as the partition key - var writer = getDataFrameWriter(classOf[SimpleKeyGenerator].getName) + var writer = getDataFrameWriter(classOf[SimpleKeyGenerator].getName, writeOpts) writer.partitionBy("driver") .mode(SaveMode.Overwrite) .save(basePath) var recordsReadDF = spark.read.format("org.apache.hudi") + 
.options(readOpts) .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("driver")).count() == 0) // Use the `driver,rider` field as the partition key, If no such field exists, // the default value [[PartitionPathEncodeUtils#DEFAULT_PARTITION_PATH]] is used - writer = getDataFrameWriter(classOf[SimpleKeyGenerator].getName) + writer = getDataFrameWriter(classOf[SimpleKeyGenerator].getName, writeOpts) writer.partitionBy("driver", "rider") .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit(DEFAULT_PARTITION_PATH)).count() == 0) } - @Test def testSparkPartitionByWithComplexKeyGenerator() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testSparkPartitionByWithComplexKeyGenerator(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + // Use the `driver` field as the partition key - var writer = getDataFrameWriter(classOf[ComplexKeyGenerator].getName) + var writer = getDataFrameWriter(classOf[ComplexKeyGenerator].getName, writeOpts) writer.partitionBy("driver") .mode(SaveMode.Overwrite) .save(basePath) var recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("driver")).count() == 0) // Use the `driver`,`rider` field as the partition key - writer = getDataFrameWriter(classOf[ComplexKeyGenerator].getName) + writer = getDataFrameWriter(classOf[ComplexKeyGenerator].getName, writeOpts) writer.partitionBy("driver", "rider") .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= concat(col("driver"), lit("/"), col("rider"))).count() == 0) } - @Test def testSparkPartitionByWithTimestampBasedKeyGenerator() { - val writer = getDataFrameWriter(classOf[TimestampBasedKeyGenerator].getName) + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testSparkPartitionByWithTimestampBasedKeyGenerator(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + + val writer = getDataFrameWriter(classOf[TimestampBasedKeyGenerator].getName, writeOpts) writer.partitionBy("current_ts") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") @@ -656,51 +742,68 @@ class TestCOWDataSource extends HoodieClientTestBase { .save(basePath) val recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*") val udf_date_format = udf((data: Long) => new DateTime(data).toString(DateTimeFormat.forPattern("yyyyMMdd"))) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= udf_date_format(col("current_ts"))).count() == 0) } - @Test def testSparkPartitionByWithGlobalDeleteKeyGenerator() { - val writer = getDataFrameWriter(classOf[GlobalDeleteKeyGenerator].getName) + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testSparkPartitionByWithGlobalDeleteKeyGenerator(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + + val writer = getDataFrameWriter(classOf[GlobalDeleteKeyGenerator].getName, writeOpts) writer.partitionBy("driver") .mode(SaveMode.Overwrite) .save(basePath) val recordsReadDF = 
spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit("")).count() == 0) } - @Test def testSparkPartitionByWithNonpartitionedKeyGenerator() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testSparkPartitionByWithNonpartitionedKeyGenerator(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + // Empty string column - var writer = getDataFrameWriter(classOf[NonpartitionedKeyGenerator].getName) + var writer = getDataFrameWriter(classOf[NonpartitionedKeyGenerator].getName, writeOpts) writer.partitionBy("") .mode(SaveMode.Overwrite) .save(basePath) var recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit("")).count() == 0) // Non-existent column - writer = getDataFrameWriter(classOf[NonpartitionedKeyGenerator].getName) + writer = getDataFrameWriter(classOf[NonpartitionedKeyGenerator].getName, writeOpts) writer.partitionBy("abc") .mode(SaveMode.Overwrite) .save(basePath) recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath) assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= lit("")).count() == 0) } @ParameterizedTest - @CsvSource(Array("true,false", "true,true", "false,true", "false,false")) - def testQueryCOWWithBasePathAndFileIndex(partitionEncode: Boolean, isMetadataEnabled: Boolean): Unit = { + @CsvSource(Array( + "true,false,AVRO", "true,true,AVRO", "false,true,AVRO", "false,false,AVRO", + "true,false,SPARK", "true,true,SPARK", "false,true,SPARK", "false,false,SPARK" + )) + def testQueryCOWWithBasePathAndFileIndex(partitionEncode: Boolean, isMetadataEnabled: Boolean, recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val N = 20 // Test query with partition prune if URL_ENCODE_PARTITIONING has enable val records1 = dataGen.generateInsertsContainsAllPartitions("000", N) val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2)) inputDF1.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode) .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) @@ -711,6 +814,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val countIn20160315 = records1.asScala.count(record => record.getPartitionPath == "2016/03/15") // query the partition by filter val count1 = spark.read.format("hudi") + .options(readOpts) .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) .load(basePath) .filter("partition = '2016/03/15'") @@ -720,6 +824,7 @@ class TestCOWDataSource extends HoodieClientTestBase { // query the partition by path val partitionPath = if (partitionEncode) "2016%2F03%2F15" else "2016/03/15" val count2 = spark.read.format("hudi") + .options(readOpts) .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) .load(basePath + s"/$partitionPath") .count() @@ -729,7 +834,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val records2 = dataGen.generateInsertsContainsAllPartitions("000", N + 1) val inputDF2 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records2), 2)) inputDF2.write.format("hudi") - .options(commonOpts) + .options(writeOpts) 
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode) .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) @@ -737,6 +842,7 @@ class TestCOWDataSource extends HoodieClientTestBase { .save(basePath) // Incremental query without "*" in path val hoodieIncViewDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime1) .load(basePath) @@ -744,10 +850,14 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals(false, Metrics.isInitialized) } - @Test def testSchemaEvolution(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testSchemaEvolution(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + // open the schema validate val opts = commonOpts ++ Map("hoodie.avro.schema.validate" -> "true") ++ - Map(DataSourceWriteOptions.RECONCILE_SCHEMA.key() -> "true") + Map(DataSourceWriteOptions.RECONCILE_SCHEMA.key() -> "true") ++ writeOpts // 1. write records with schema1 val schema1 = StructType(StructField("_row_key", StringType, true) :: StructField("name", StringType, false):: StructField("timestamp", IntegerType, true) :: StructField("partition", IntegerType, true)::Nil) @@ -775,6 +885,7 @@ class TestCOWDataSource extends HoodieClientTestBase { .mode(SaveMode.Append) .save(basePath) val recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*") val tableMetaClient = HoodieTableMetaClient.builder().setConf(spark.sparkContext.hadoopConfiguration).setBasePath(basePath).build() val actualSchema = new TableSchemaResolver(tableMetaClient).getTableAvroSchemaWithoutMetadataFields @@ -803,7 +914,11 @@ class TestCOWDataSource extends HoodieClientTestBase { } } - @Test def testSchemaNotEqualData(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testSchemaNotEqualData(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val opts = commonOpts ++ Map("hoodie.avro.schema.validate" -> "true") val schema1 = StructType(StructField("_row_key", StringType, true) :: StructField("name", StringType, true):: StructField("timestamp", IntegerType, true):: StructField("age", StringType, true) :: StructField("partition", IntegerType, true)::Nil) @@ -815,39 +930,46 @@ class TestCOWDataSource extends HoodieClientTestBase { .mode(SaveMode.Overwrite) .save(basePath) val recordsReadDF = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath) val resultSchema = new StructType(recordsReadDF.schema.filter(p=> !p.name.startsWith("_hoodie")).toArray) assertEquals(resultSchema, schema1) } @ParameterizedTest - @ValueSource(booleans = Array(true, false)) - def testCopyOnWriteWithDroppedPartitionColumns(enableDropPartitionColumns: Boolean) { + @CsvSource(Array("true, AVRO", "false, AVRO", "true, SPARK", "false, SPARK")) + def testCopyOnWriteWithDroppedPartitionColumns(enableDropPartitionColumns: Boolean, recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + val records1 = recordsToStrings(dataGen.generateInsertsContainsAllPartitions("000", 100)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) 
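A note on the writeOpts applied in the write that follows (and in every parameterized test in this file): for the SPARK record type they are simply commonOpts plus two extra settings that switch the writer to the Spark-native record merger and to parquet-encoded log blocks. A minimal sketch of that layer, assuming the sparkOpts map in TestCOWDataSource mirrors the one defined for TestMORDataSource later in this patch:

import org.apache.hudi.HoodieSparkRecordMerger
import org.apache.hudi.common.config.HoodieStorageConfig
import org.apache.hudi.config.HoodieWriteConfig

// Options layered on top of commonOpts whenever HoodieRecordType.SPARK is exercised.
val sparkOpts = Map(
  HoodieWriteConfig.MERGER_IMPLS.key -> classOf[HoodieSparkRecordMerger].getName, // Spark-native record merger
  HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT.key -> "parquet"                  // parquet log data blocks instead of avro
)
val writeOpts = commonOpts ++ sparkOpts // readers additionally get sparkOpts on their own (see getOpts below)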
inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.DROP_PARTITION_COLUMNS.key, enableDropPartitionColumns) .mode(SaveMode.Overwrite) .save(basePath) - val snapshotDF1 = spark.read.format("org.apache.hudi").load(basePath) + val snapshotDF1 = spark.read.format("org.apache.hudi").options(readOpts).load(basePath) assertEquals(snapshotDF1.count(), 100) assertEquals(3, snapshotDF1.select("partition").distinct().count()) } - @Test - def testHoodieIsDeletedCOW(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testHoodieIsDeletedCOW(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val numRecords = 100 val numRecordsToDelete = 2 val records0 = recordsToStrings(dataGen.generateInserts("000", numRecords)).toList val df0 = spark.read.json(spark.sparkContext.parallelize(records0, 2)) df0.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Overwrite) .save(basePath) val snapshotDF0 = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*/*/*") assertEquals(numRecords, snapshotDF0.count()) @@ -855,10 +977,11 @@ class TestCOWDataSource extends HoodieClientTestBase { val dropDf = df1.drop(df1.columns.filter(_.startsWith("_hoodie_")): _*) val df2 = dropDf.withColumn("_hoodie_is_deleted", lit(true).cast(BooleanType)) df2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) val snapshotDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .load(basePath + "/*/*/*/*") assertEquals(numRecords - numRecordsToDelete, snapshotDF2.count()) } @@ -889,8 +1012,10 @@ class TestCOWDataSource extends HoodieClientTestBase { } @ParameterizedTest - @ValueSource(booleans = Array(true, false)) - def testPartitionColumnsProperHandling(useGlobbing: Boolean): Unit = { + @CsvSource(Array("true, AVRO", "false, AVRO", "true, SPARK", "false, SPARK")) + def testPartitionColumnsProperHandling(useGlobbing: Boolean, recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val _spark = spark import _spark.implicits._ @@ -898,7 +1023,7 @@ class TestCOWDataSource extends HoodieClientTestBase { .toDF("id", "name", "age", "ts", "data_date") df.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option("hoodie.insert.shuffle.parallelism", "4") .option("hoodie.upsert.shuffle.parallelism", "4") .option("hoodie.bulkinsert.shuffle.parallelism", "2") @@ -923,7 +1048,7 @@ class TestCOWDataSource extends HoodieClientTestBase { } // Case #1: Partition columns are read from the data file - val firstDF = spark.read.format("hudi").load(path) + val firstDF = spark.read.format("hudi").options(readOpts).load(path) assert(firstDF.count() == 2) @@ -947,6 +1072,7 @@ class TestCOWDataSource extends HoodieClientTestBase { // won't be able to infer partitioning properly if (!useGlobbing) { val secondDF = spark.read.format("hudi") + .options(readOpts) .option(DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key, "true") .load(path) @@ -1050,4 +1176,19 @@ class TestCOWDataSource extends HoodieClientTestBase { assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000")) assertEquals(false, Metrics.isInitialized, "Metrics should be shutdown") } + + def getOpts(recordType: 
HoodieRecordType): (Map[String, String], Map[String, String]) = { + val writeOpts = if (recordType == HoodieRecordType.SPARK) { + commonOpts ++ sparkOpts + } else { + commonOpts + } + val readOpts = if (recordType == HoodieRecordType.SPARK) { + sparkOpts + } else { + Map.empty[String, String] + } + + (writeOpts, readOpts) + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index 2c17bb8cddefd..b62cd1a524bb8 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -23,14 +23,15 @@ import org.apache.hadoop.fs.{LocatedFileStatus, Path} import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieConversionUtils.toProperties -import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig} import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ParquetUtils -import org.apache.hudi.config.{HoodieStorageConfig, HoodieWriteConfig} +import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.functional.TestColumnStatsIndex.ColumnStatsTestCase import org.apache.hudi.testutils.HoodieClientTestBase import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} + import org.apache.spark.sql._ import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.types._ @@ -41,6 +42,7 @@ import org.junit.jupiter.params.provider.{Arguments, MethodSource, ValueSource} import java.math.BigInteger import java.sql.{Date, Timestamp} + import scala.collection.JavaConverters._ import scala.util.Random diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala index 87af8c668c9e8..b88301b833a71 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala @@ -20,7 +20,7 @@ package org.apache.hudi.functional import org.apache.hadoop.fs.Path import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieConversionUtils.toJavaOption -import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig} import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieTableType} import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.testutils.HoodieTestDataGenerator @@ -32,7 +32,7 @@ import org.apache.hudi.keygen.NonpartitionedKeyGenerator import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config import org.apache.hudi.testutils.{DataSourceTestUtils, HoodieClientTestBase} import org.apache.hudi.util.JFunction -import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, SparkDatasetMixin} +import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, 
HoodieSparkRecordMerger, SparkDatasetMixin} import org.apache.log4j.LogManager import org.apache.spark.sql._ import org.apache.spark.sql.functions._ @@ -41,9 +41,10 @@ import org.apache.spark.sql.types.BooleanType import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest -import org.junit.jupiter.params.provider.CsvSource - +import org.junit.jupiter.params.provider.{CsvSource, EnumSource} import java.util.function.Consumer +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType +import org.apache.hudi.hadoop.config.HoodieRealtimeConfig import scala.collection.JavaConverters._ /** @@ -61,6 +62,10 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp", HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" ) + val sparkOpts = Map( + HoodieWriteConfig.MERGER_IMPLS.key -> classOf[HoodieSparkRecordMerger].getName, + HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT.key -> "parquet" + ) val verificationCol: String = "driver" val updatedVerificationVal: String = "driver_update" @@ -86,14 +91,18 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { JFunction.toJava((receiver: SparkSessionExtensions) => new HoodieSparkSessionExtension().apply(receiver))) ) - @Test def testCount() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testCount(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + // First Operation: // Producing parquet files to three default partitions. // SNAPSHOT view on MOR table with parquet files only. val records1 = recordsToStrings(dataGen.generateInserts("001", 100)).asScala val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) @@ -101,6 +110,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .save(basePath) assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000")) val hudiSnapshotDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(100, hudiSnapshotDF1.count()) // still 100, since we only updated @@ -111,10 +121,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records2 = recordsToStrings(dataGen.generateUniqueUpdates("002", 100)).asScala val inputDF2: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) val hudiSnapshotDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(100, hudiSnapshotDF2.count()) // still 100, since we only updated @@ -127,6 +138,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // 
incremental view // base file only val hudiIncDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000") .option(DataSourceReadOptions.END_INSTANTTIME.key, commit1Time) @@ -137,6 +149,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { hudiIncDF1.show(1) // log file only val hudiIncDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit1Time) .option(DataSourceReadOptions.END_INSTANTTIME.key, commit2Time) @@ -148,6 +161,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // base file + log file val hudiIncDF3 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000") .option(DataSourceReadOptions.END_INSTANTTIME.key, commit2Time) @@ -159,6 +173,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // Test incremental query has no instant in range val emptyIncDF = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000") .option(DataSourceReadOptions.END_INSTANTTIME.key, "001") @@ -167,6 +182,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // Unmerge val hudiSnapshotSkipMergeDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) .load(basePath + "/*/*/*/*") @@ -176,6 +192,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // Test Read Optimized Query on MOR table val hudiRODF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(100, hudiRODF2.count()) @@ -186,10 +203,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records3 = recordsToStrings(dataGen.generateUniqueUpdates("003", 50)).asScala val inputDF3: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records3, 2)) inputDF3.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) val hudiSnapshotDF3 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") // still 100, because we only updated the existing records @@ -203,6 +221,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // incremental query from commit2Time val hudiIncDF4 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time) .load(basePath) @@ -211,6 +230,7 @@ class TestMORDataSource extends 
HoodieClientTestBase with SparkDatasetMixin { // skip merge incremental view // including commit 2 and commit 3 val hudiIncDF4SkipMerge = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000") .option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) @@ -226,10 +246,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records4 = recordsToStrings(newDataGen.generateInserts("004", 100)).asScala val inputDF4: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records4, 2)) inputDF4.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) val hudiSnapshotDF4 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") // 200, because we insert 100 records to a new partition @@ -239,6 +260,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // Incremental query, 50 from log file, 100 from base file of the new partition. val hudiIncDF5 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time) .load(basePath) @@ -251,11 +273,12 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records5 = recordsToStrings(newDataGen.generateUniqueUpdates("005", 50)).asScala val inputDF5: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records5, 2)) inputDF5.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) val commit5Time = HoodieDataSourceHelpers.latestCommit(fs, basePath) val hudiSnapshotDF5 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(200, hudiSnapshotDF5.count()) @@ -265,16 +288,18 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records6 = recordsToStrings(newDataGen.generateInserts("006", 2)).asScala val inputDF6: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records6, 2)) inputDF6.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option("hoodie.compact.inline", "true") .mode(SaveMode.Append) .save(basePath) val commit6Time = HoodieDataSourceHelpers.latestCommit(fs, basePath) val hudiSnapshotDF6 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/2020/01/10/*") assertEquals(102, hudiSnapshotDF6.count()) val hudiIncDF6 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit5Time) .option(DataSourceReadOptions.END_INSTANTTIME.key, commit6Time) @@ -285,14 +310,48 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { } @Test - def testPayloadDelete() { + def testSpill() { + val (writeOpts, readOpts) = getOpts(HoodieRecordType.SPARK) + + 
val records1 = recordsToStrings(dataGen.generateInserts("001", 100)).asScala
+    val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
+    inputDF1.write.format("org.apache.hudi")
+      .options(writeOpts)
+      .option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same
+      .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
+      .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+
+    val records2 = recordsToStrings(dataGen.generateUniqueUpdates("002", 100)).asScala
+    val inputDF2: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records2, 2))
+    inputDF2.write.format("org.apache.hudi")
+      .options(writeOpts)
+      .mode(SaveMode.Append)
+      .save(basePath)
+
+    // Force spilling by shrinking the compaction memory fraction
+    spark.sparkContext.hadoopConfiguration.set(HoodieRealtimeConfig.COMPACTION_MEMORY_FRACTION_PROP, "0.00001")
+    val hudiSnapshotDF1 = spark.read.format("org.apache.hudi")
+      .options(readOpts)
+      .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
+      .load(basePath + "/*/*/*/*")
+    assertEquals(100, hudiSnapshotDF1.count()) // still 100, since we only updated
+    spark.sparkContext.hadoopConfiguration.set(HoodieRealtimeConfig.COMPACTION_MEMORY_FRACTION_PROP, HoodieRealtimeConfig.DEFAULT_COMPACTION_MEMORY_FRACTION)
+  }
+
+  @ParameterizedTest
+  @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK"))
+  def testPayloadDelete(recordType: HoodieRecordType) {
+    val (writeOpts, readOpts) = getOpts(recordType)
+
    // First Operation:
    // Producing parquet files to three default partitions.
    // SNAPSHOT view on MOR table with parquet files only.
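Stepping back to the testSpill case added above: the snapshot reader derives its spillable-map memory budget from the compaction memory fraction in the Hadoop configuration, so pinning that fraction to a tiny value forces merged log records onto disk while the query result must stay identical. A minimal sketch of the same toggle with an explicit restore step (the try/finally wrapper is an addition of this sketch, not part of the test):

import org.apache.hudi.hadoop.config.HoodieRealtimeConfig

// Shrink the memory budget so the merged log records spill to disk during the read.
spark.sparkContext.hadoopConfiguration.set(HoodieRealtimeConfig.COMPACTION_MEMORY_FRACTION_PROP, "0.00001")
try {
  val merged = spark.read.format("org.apache.hudi")
    .options(readOpts)
    .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
    .load(basePath + "/*/*/*/*")
  assert(merged.count() == 100) // spilling must not change the merged result
} finally {
  // Restore the default so later reads in the suite are unaffected.
  spark.sparkContext.hadoopConfiguration.set(
    HoodieRealtimeConfig.COMPACTION_MEMORY_FRACTION_PROP,
    HoodieRealtimeConfig.DEFAULT_COMPACTION_MEMORY_FRACTION)
}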
val records1 = recordsToStrings(dataGen.generateInserts("001", 100)).asScala val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) @@ -300,6 +359,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .save(basePath) assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000")) val hudiSnapshotDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(100, hudiSnapshotDF1.count()) // still 100, since we only updated @@ -310,10 +370,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records2 = recordsToStrings(dataGen.generateUniqueDeleteRecords("002", 50)).asScala val inputDF2: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) val hudiSnapshotDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(50, hudiSnapshotDF2.count()) // 50 records were deleted @@ -325,6 +386,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // unmerge query, skip the delete records val hudiSnapshotDF2Unmerge = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) .load(basePath + "/*/*/*/*") @@ -332,6 +394,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // incremental query, read 50 delete records from log file and get 0 count. 
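(To make the contrast above explicit before the code that follows: after the delete batch the merged snapshot shrinks to 50 rows, the skip-merge snapshot keeps returning the untouched base-file rows because log records, deletes included, are never applied to them, and an incremental read over the delete commit yields nothing, since every record it touches resolves to a delete. A compact restatement of the two non-default reads, reusing the option keys from this patch:)

// Snapshot read that skips merging log records into the base files; delete records are not applied.
val unmerged = spark.read.format("org.apache.hudi")
  .options(readOpts)
  .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
  .option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
  .load(basePath + "/*/*/*/*")

// Incremental read starting at the delete commit; the merge applies the deletes, so no rows survive.
val incrementalDeletes = spark.read.format("org.apache.hudi")
  .options(readOpts)
  .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
  .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time)
  .load(basePath)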
val hudiIncDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time) .load(basePath) @@ -343,23 +406,27 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records3 = recordsToStrings(dataGen.generateUniqueDeleteRecords("003", 50)).asScala val inputDF3: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records3, 2)) inputDF3.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) val hudiSnapshotDF3 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(0, hudiSnapshotDF3.count()) // 100 records were deleted, 0 record to load } - @Test - def testPrunedFiltered() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testPrunedFiltered(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + // First Operation: // Producing parquet files to three default partitions. // SNAPSHOT view on MOR table with parquet files only. // Overriding the partition-path field - val opts = commonOpts + (DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition_path") + val opts = writeOpts + (DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition_path") val hoodieRecords1 = dataGen.generateInserts("001", 100) @@ -373,6 +440,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .mode(SaveMode.Overwrite) .save(basePath) val hudiSnapshotDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") val commit1Time = hudiSnapshotDF1.select("_hoodie_commit_time").head().get(0).toString @@ -395,18 +463,22 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .mode(SaveMode.Append) .save(basePath) val hudiSnapshotDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") val hudiIncDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000") .load(basePath) val hudiIncDF1Skipmerge = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000") .load(basePath) val hudiIncDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit1Time) .load(basePath) @@ -441,6 +513,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .mode(SaveMode.Append).save(basePath) val hudiSnapshotDF3 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, 
DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") @@ -450,8 +523,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { assertEquals(0, hudiSnapshotDF3.filter("rider = 'rider-003'").count()) } - @Test - def testVectorizedReader() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testVectorizedReader(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + spark.conf.set("spark.sql.parquet.enableVectorizedReader", true) assertTrue(spark.conf.get("spark.sql.parquet.enableVectorizedReader").toBoolean) // Vectorized Reader will only be triggered with AtomicType schema, @@ -460,13 +536,14 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records1 = recordsToStrings(dataGen.generateInsertsAsPerSchema("001", 100, schema)).asScala val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) .mode(SaveMode.Overwrite) .save(basePath) val hudiSnapshotDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(100, hudiSnapshotDF1.count()) @@ -475,10 +552,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .asScala val inputDF2: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) val hudiSnapshotDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(100, hudiSnapshotDF2.count()) @@ -496,7 +574,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { hudiSnapshotDF2.show(1) } - @Test def testNoPrecombine() { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testNoPrecombine(recordType: HoodieRecordType) { + val (writeOpts, readOpts) = getOpts(recordType) + // Insert Operation val records = recordsToStrings(dataGen.generateInserts("000", 100)).asScala val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) @@ -507,7 +589,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" - ) + ) ++ writeOpts inputDF.write.format("hudi") .options(commonOptsNoPreCombine) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) @@ -515,43 +597,46 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .mode(SaveMode.Overwrite) .save(basePath) - spark.read.format("org.apache.hudi").load(basePath).count() + spark.read.format("org.apache.hudi").options(readOpts).load(basePath).count() } - @Test - def 
testPreCombineFiledForReadMOR(): Unit = { - writeData((1, "a0", 10, 100, false)) - checkAnswer((1, "a0", 10, 100, false)) + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testPreCombineFiledForReadMOR(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + + writeData((1, "a0", 10, 100, false), writeOpts) + checkAnswer((1, "a0", 10, 100, false), readOpts) - writeData((1, "a0", 12, 99, false)) + writeData((1, "a0", 12, 99, false), writeOpts) // The value has not update, because the version 99 < 100 - checkAnswer((1, "a0", 10, 100, false)) + checkAnswer((1, "a0", 10, 100, false), readOpts) - writeData((1, "a0", 12, 101, false)) + writeData((1, "a0", 12, 101, false), writeOpts) // The value has update - checkAnswer((1, "a0", 12, 101, false)) + checkAnswer((1, "a0", 12, 101, false), readOpts) - writeData((1, "a0", 14, 98, false)) + writeData((1, "a0", 14, 98, false), writeOpts) // Latest value should be ignored if preCombine honors ordering - checkAnswer((1, "a0", 12, 101, false)) + checkAnswer((1, "a0", 12, 101, false), readOpts) - writeData((1, "a0", 16, 97, true)) + writeData((1, "a0", 16, 97, true), writeOpts) // Ordering value will be honored, the delete record is considered as obsolete // because it has smaller version number (97 < 101) - checkAnswer((1, "a0", 12, 101, false)) + checkAnswer((1, "a0", 12, 101, false), readOpts) - writeData((1, "a0", 18, 96, false)) + writeData((1, "a0", 18, 96, false), writeOpts) // Ordering value will be honored, the data record is considered as obsolete // because it has smaller version number (96 < 101) - checkAnswer((1, "a0", 12, 101, false)) + checkAnswer((1, "a0", 12, 101, false), readOpts) } - private def writeData(data: (Int, String, Int, Int, Boolean)): Unit = { + private def writeData(data: (Int, String, Int, Int, Boolean), opts: Map[String, String]): Unit = { val _spark = spark import _spark.implicits._ val df = Seq(data).toDF("id", "name", "value", "version", "_hoodie_is_deleted") df.write.format("org.apache.hudi") - .options(commonOpts) + .options(opts) // use DefaultHoodieRecordPayload here .option(PAYLOAD_CLASS_NAME.key, classOf[DefaultHoodieRecordPayload].getCanonicalName) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) @@ -563,8 +648,9 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .save(basePath) } - private def checkAnswer(expect: (Int, String, Int, Int, Boolean)): Unit = { + private def checkAnswer(expect: (Int, String, Int, Int, Boolean), opts: Map[String, String]): Unit = { val readDf = spark.read.format("org.apache.hudi") + .options(opts) .load(basePath + "/*") if (expect._5) { if (!readDf.isEmpty) { @@ -599,14 +685,19 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { } @ParameterizedTest - @CsvSource(Array("true,false", "true,true", "false,true", "false,false")) - def testQueryMORWithBasePathAndFileIndex(partitionEncode: Boolean, isMetadataEnabled: Boolean): Unit = { + @CsvSource(Array( + "true,false,AVRO", "true,true,AVRO", "false,true,AVRO", "false,false,AVRO", + "true,false,SPARK", "true,true,SPARK", "false,true,SPARK", "false,false,SPARK" + )) + def testQueryMORWithBasePathAndFileIndex(partitionEncode: Boolean, isMetadataEnabled: Boolean, recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val N = 20 // Test query with partition prune if URL_ENCODE_PARTITIONING has enable val records1 = 
dataGen.generateInsertsContainsAllPartitions("000", N) val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1).asScala, 2)) inputDF1.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) .option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode) @@ -618,6 +709,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val countIn20160315 = records1.asScala.count(record => record.getPartitionPath == "2016/03/15") // query the partition by filter val count1 = spark.read.format("hudi") + .options(readOpts) .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) .load(basePath) .filter("partition = '2016/03/15'") @@ -627,6 +719,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // query the partition by path val partitionPath = if (partitionEncode) "2016%2F03%2F15" else "2016/03/15" val count2 = spark.read.format("hudi") + .options(readOpts) .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) .load(basePath + s"/$partitionPath") .count() @@ -636,7 +729,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records2 = dataGen.generateInsertsContainsAllPartitions("000", N + 1) val inputDF2 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records2).asScala, 2)) inputDF2.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) .option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode) @@ -645,6 +738,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .save(basePath) // Incremental query without "*" in path val hoodieIncViewDF1 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime1) .load(basePath) @@ -652,8 +746,13 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { } @ParameterizedTest - @CsvSource(Array("true, false", "false, true", "false, false", "true, true")) - def testMORPartitionPrune(partitionEncode: Boolean, hiveStylePartition: Boolean): Unit = { + @CsvSource(Array( + "true, false, AVRO", "false, true, AVRO", "false, false, AVRO", "true, true, AVRO", + "true, false, SPARK", "false, true, SPARK", "false, false, SPARK", "true, true, SPARK" + )) + def testMORPartitionPrune(partitionEncode: Boolean, hiveStylePartition: Boolean, recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val partitions = Array("2021/03/01", "2021/03/02", "2021/03/03", "2021/03/04", "2021/03/05") val newDataGen = new HoodieTestDataGenerator(partitions) val records1 = newDataGen.generateInsertsContainsAllPartitions("000", 100).asScala @@ -662,7 +761,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val partitionCounts = partitions.map(p => p -> records1.count(r => r.getPartitionPath == p)).toMap inputDF1.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, 
DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) .option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode) @@ -671,42 +770,49 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .save(basePath) val count1 = spark.read.format("hudi") + .options(readOpts) .load(basePath) .filter("partition = '2021/03/01'") .count() assertEquals(partitionCounts("2021/03/01"), count1) val count2 = spark.read.format("hudi") + .options(readOpts) .load(basePath) .filter("partition > '2021/03/01' and partition < '2021/03/03'") .count() assertEquals(partitionCounts("2021/03/02"), count2) val count3 = spark.read.format("hudi") + .options(readOpts) .load(basePath) .filter("partition != '2021/03/01'") .count() assertEquals(records1.size - partitionCounts("2021/03/01"), count3) val count4 = spark.read.format("hudi") + .options(readOpts) .load(basePath) .filter("partition like '2021/03/03%'") .count() assertEquals(partitionCounts("2021/03/03"), count4) val count5 = spark.read.format("hudi") + .options(readOpts) .load(basePath) .filter("partition like '%2021/03/%'") .count() assertEquals(records1.size, count5) val count6 = spark.read.format("hudi") + .options(readOpts) .load(basePath) .filter("partition = '2021/03/01' or partition = '2021/03/05'") .count() assertEquals(partitionCounts("2021/03/01") + partitionCounts("2021/03/05"), count6) val count7 = spark.read.format("hudi") + .options(readOpts) .load(basePath) .filter("substr(partition, 9, 10) = '03'") .count() @@ -714,13 +820,16 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { assertEquals(partitionCounts("2021/03/03"), count7) } - @Test - def testReadPathsForMergeOnReadTable(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testReadPathsForMergeOnReadTable(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + // Paths only baseFiles val records1 = dataGen.generateInserts("001", 100) val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1).asScala, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) @@ -734,10 +843,10 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records2 = dataGen.generateUniqueDeleteRecords("002", 100) val inputDF2: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records2).asScala, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .mode(SaveMode.Append) .save(basePath) - val hudiReadPathDF1 = spark.read.format("org.apache.hudi") + val hudiReadPathDF1 = spark.read.options(readOpts).format("org.apache.hudi") .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .option(DataSourceReadOptions.READ_PATHS.key, baseFilePath) .load() @@ -753,6 +862,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val readPaths = baseFilePath + "," + logFilePath val hudiReadPathDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) 
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .option(DataSourceReadOptions.READ_PATHS.key, readPaths) .load() @@ -760,13 +870,16 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { assertEquals(0, hudiReadPathDF2.count()) } - @Test - def testReadPathsForOnlyLogFiles(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testReadPathsForOnlyLogFiles(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + initMetaClient(HoodieTableType.MERGE_ON_READ) val records1 = dataGen.generateInsertsContainsAllPartitions("000", 20) val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1).asScala, 2)) inputDF1.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) // Use InMemoryIndex to generate log only mor table. @@ -784,7 +897,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val records2 = dataGen.generateInsertsContainsAllPartitions("000", 20) val inputDF2 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records2).asScala, 2)) inputDF2.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) // Use InMemoryIndex to generate log only mor table. @@ -796,7 +909,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val expectedCount1 = records1.asScala.count(record => record.getPartitionPath == dataGen.getPartitionPaths.head) - val hudiReadPathDF = spark.read.format("org.apache.hudi") + val hudiReadPathDF = spark.read.options(readOpts).format("org.apache.hudi") .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .option(DataSourceReadOptions.READ_PATHS.key, logFilePath) .load() @@ -804,13 +917,16 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { assertEquals(expectedCount1, hudiReadPathDF.count()) } - @Test - def testReadLogOnlyMergeOnReadTable(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testReadLogOnlyMergeOnReadTable(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + initMetaClient(HoodieTableType.MERGE_ON_READ) val records1 = dataGen.generateInsertsContainsAllPartitions("000", 20) val inputDF = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1).asScala, 2)) inputDF.write.format("hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) // Use InMemoryIndex to generate log only mor table. @@ -822,15 +938,18 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // Test read logs only mor table with glob paths. assertEquals(20, spark.read.format("hudi").load(basePath + "/*/*/*/*").count()) // Test read log only mor table. 
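(As the two assertions around this point show, a log-only MOR table can be read through glob paths or through the table base path; testReadPathsForOnlyLogFiles above additionally bypasses path resolution entirely and hands the reader explicit file paths. A minimal sketch of that last form, assuming logFilePath was collected from the file system as in the test:)

// Snapshot read over an explicit, comma-separated list of files; load() takes no path argument here.
val logOnlyDF = spark.read.format("org.apache.hudi")
  .options(readOpts)
  .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
  .option(DataSourceReadOptions.READ_PATHS.key, logFilePath)
  .load()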
- assertEquals(20, spark.read.format("hudi").load(basePath).count()) + assertEquals(20, spark.read.format("hudi").options(readOpts).load(basePath).count()) } - @Test - def testTempFilesCleanForClustering(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testTempFilesCleanForClustering(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val records1 = recordsToStrings(dataGen.generateInserts("001", 1000)).asScala val inputDF1: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key(), DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) // option for clustering @@ -843,8 +962,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { assertEquals(true, fs.listStatus(tempPath).isEmpty) } - @Test - def testClusteringOnNullableColumn(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testClusteringOnNullableColumn(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val records1 = recordsToStrings(dataGen.generateInserts("001", 1000)).asScala val inputDF1: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records1, 2)) .withColumn("cluster_id", when(expr("end_lon < 0.2 "), lit(null).cast("string")) @@ -852,7 +974,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .withColumn("struct_cluster_col", when(expr("end_lon < 0.1"), lit(null)) .otherwise(struct(col("cluster_id"), col("_row_key")))) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key(), DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) // option for clustering @@ -865,15 +987,18 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .save(basePath) } - @Test - def testHoodieIsDeletedMOR(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testHoodieIsDeletedMOR(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val numRecords = 100 val numRecordsToDelete = 2 val schema = HoodieTestDataGenerator.SHORT_TRIP_SCHEMA val records0 = recordsToStrings(dataGen.generateInsertsAsPerSchema("000", numRecords, schema)).asScala val inputDF0 = spark.read.json(spark.sparkContext.parallelize(records0, 2)) inputDF0.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option("hoodie.compact.inline", "false") .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) @@ -881,6 +1006,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { .save(basePath) val snapshotDF0 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(numRecords, snapshotDF0.count()) @@ -890,12 +1016,13 @@ class TestMORDataSource 
extends HoodieClientTestBase with SparkDatasetMixin { val df2 = dropDf.withColumn("_hoodie_is_deleted", lit(true).cast(BooleanType)) df2.write.format("org.apache.hudi") - .options(commonOpts) + .options(writeOpts) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) .mode(SaveMode.Append) .save(basePath) val snapshotDF2 = spark.read.format("org.apache.hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) .load(basePath + "/*/*/*/*") assertEquals(numRecords - numRecordsToDelete, snapshotDF2.count()) @@ -908,8 +1035,11 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { * * For MOR table, test all the three query modes. */ - @Test - def testPrunePartitionForTimestampBasedKeyGenerator(): Unit = { + @ParameterizedTest + @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) + def testPrunePartitionForTimestampBasedKeyGenerator(recordType: HoodieRecordType): Unit = { + val (writeOpts, readOpts) = getOpts(recordType) + val options = commonOpts ++ Map( "hoodie.compact.inline" -> "false", DataSourceWriteOptions.TABLE_TYPE.key -> DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, @@ -918,7 +1048,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP -> "yyyy/MM/dd", Config.TIMESTAMP_TIMEZONE_FORMAT_PROP -> "GMT+8:00", Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP -> "yyyy-MM-dd" - ) + ) ++ writeOpts val dataGen1 = new HoodieTestDataGenerator(Array("2022-01-01")) val records1 = recordsToStrings(dataGen1.generateInserts("001", 50)).asScala @@ -951,7 +1081,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { val commit3Time = metaClient.reloadActiveTimeline.lastInstant().get().getTimestamp // snapshot query - val snapshotQueryRes = spark.read.format("hudi").load(basePath) + val snapshotQueryRes = spark.read.format("hudi").options(readOpts).load(basePath) assertEquals(snapshotQueryRes.where(s"_hoodie_commit_time = '$commit1Time'").count, 50) assertEquals(snapshotQueryRes.where(s"_hoodie_commit_time = '$commit2Time'").count, 40) assertEquals(snapshotQueryRes.where(s"_hoodie_commit_time = '$commit3Time'").count, 20) @@ -961,6 +1091,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // read_optimized query val readOptimizedQueryRes = spark.read.format("hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL) .load(basePath) // TODO(HUDI-3204) we have to revert this to pre-existing behavior from 0.10 @@ -971,6 +1102,7 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { // incremental query val incrementalQueryRes = spark.read.format("hudi") + .options(readOpts) .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time) .option(DataSourceReadOptions.END_INSTANTTIME.key, commit3Time) @@ -978,4 +1110,19 @@ class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { assertEquals(incrementalQueryRes.where("partition = '2022-01-01'").count, 0) assertEquals(incrementalQueryRes.where("partition = '2022-01-02'").count, 20) } + + def getOpts(recordType: HoodieRecordType): (Map[String, String], Map[String, String]) = { + val writeOpts = if (recordType == HoodieRecordType.SPARK) { + commonOpts ++ sparkOpts + } 
else { + commonOpts + } + val readOpts = if (recordType == HoodieRecordType.SPARK) { + sparkOpts + } else { + Map.empty[String, String] + } + + (writeOpts, readOpts) + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMetadataTableWithSparkDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMetadataTableWithSparkDataSource.scala index 02e0ee6dfd9bc..9942132aba807 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMetadataTableWithSparkDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMetadataTableWithSparkDataSource.scala @@ -20,6 +20,7 @@ package org.apache.hudi.functional import org.apache.hudi.DataSourceWriteOptions import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.model.HoodieAvroRecordMerger import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings import org.apache.hudi.config.HoodieWriteConfig @@ -27,7 +28,6 @@ import org.apache.hudi.testutils.SparkClientFunctionalTestHarness import org.apache.spark.sql.SaveMode import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.{Tag, Test} - import scala.collection.JavaConverters._ @Tag("functional") diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala index 00ab7091445db..214b725641764 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala @@ -19,11 +19,11 @@ package org.apache.hudi.functional import org.apache.avro.Schema import org.apache.hudi.HoodieBaseRelation.projectSchema -import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig} import org.apache.hudi.common.model.{HoodieRecord, OverwriteNonDefaultsWithLatestAvroPayload} import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.common.testutils.{HadoopMapRedUtils, HoodieTestDataGenerator} -import org.apache.hudi.config.{HoodieStorageConfig, HoodieWriteConfig} +import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.keygen.NonpartitionedKeyGenerator import org.apache.hudi.testutils.SparkClientFunctionalTestHarness import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, DefaultSource, HoodieBaseRelation, HoodieSparkUtils, HoodieUnsafeRDD} @@ -33,7 +33,6 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.{Dataset, HoodieUnsafeUtils, Row, SaveMode} import org.junit.jupiter.api.Assertions.{assertEquals, fail} import org.junit.jupiter.api.{Disabled, Tag, Test} - import scala.collection.JavaConverters._ @Tag("functional") diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestStructuredStreaming.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestStructuredStreaming.scala index 1382bafb7621a..95f75294cd4b3 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestStructuredStreaming.scala +++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestStructuredStreaming.scala @@ -24,16 +24,18 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestTable} import org.apache.hudi.common.util.CollectionUtils -import org.apache.hudi.config.{HoodieClusteringConfig, HoodieCompactionConfig, HoodieStorageConfig, HoodieWriteConfig} +import org.apache.hudi.common.config.HoodieStorageConfig +import org.apache.hudi.config.{HoodieClusteringConfig, HoodieCompactionConfig, HoodieWriteConfig} import org.apache.hudi.exception.TableNotFoundException import org.apache.hudi.testutils.HoodieClientTestBase import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers} import org.apache.log4j.LogManager + import org.apache.spark.sql._ import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.types.StructType import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} -import org.junit.jupiter.api.{AfterEach, BeforeEach} +import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{EnumSource, ValueSource} @@ -42,6 +44,8 @@ import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.duration.Duration import scala.concurrent.{Await, Future} +import org.apache.hudi.common.config.HoodieStorageConfig + /** * Basic tests on the spark datasource for structured streaming sink */ diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/ReadAndWriteWithoutAvroBenchmark.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/ReadAndWriteWithoutAvroBenchmark.scala new file mode 100644 index 0000000000000..ebdb943ed8850 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/ReadAndWriteWithoutAvroBenchmark.scala @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.config.HoodieStorageConfig +import org.apache.hudi.common.model.HoodieAvroRecordMerger +import org.apache.hudi.config.{HoodieCompactionConfig, HoodieWriteConfig} +import org.apache.hudi.{HoodieSparkRecordMerger, HoodieSparkUtils} + +import org.apache.spark.SparkConf +import org.apache.spark.hudi.benchmark.{HoodieBenchmark, HoodieBenchmarkBase} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.hudi.HoodieSparkSessionExtension +import org.apache.spark.sql.{DataFrame, SparkSession} + +object ReadAndWriteWithoutAvroBenchmark extends HoodieBenchmarkBase { + + protected val spark: SparkSession = getSparkSession + private val avroTable = "avro_merger_table" + private val sparkTable = "spark_merger_table" + + def getSparkSession: SparkSession = SparkSession + .builder() + .master("local[4]") + .appName(this.getClass.getCanonicalName) + .withExtensions(new HoodieSparkSessionExtension) + .config("spark.driver.memory", "4G") + .config("spark.executor.memory", "4G") + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config("hoodie.insert.shuffle.parallelism", "2") + .config("hoodie.upsert.shuffle.parallelism", "2") + .config("hoodie.delete.shuffle.parallelism", "2") + .config("spark.sql.session.timeZone", "CTT") + .config(sparkConf()) + .getOrCreate() + + def sparkConf(): SparkConf = { + val sparkConf = new SparkConf() + if (HoodieSparkUtils.gteqSpark3_2) { + sparkConf.set("spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.hudi.catalog.HoodieCatalog") + } + sparkConf + } + + private def createComplexDataFrame(rowNum: Long, colNum: Int): DataFrame = { + var df = spark.range(0, rowNum).toDF("id") + .withColumn("t1", lit(1)) + .withColumn("d1", lit(12.99d)) + .withColumn("s1", lit("s1")) + .withColumn("s2", lit("s2")) + .withColumn("s3", lit("s3")) + for (i <- 0 to colNum) { + df = df.withColumn(s"struct$i", struct(col("s1").as("st1"), col("s2").as("st2"), col("s3").as("st3"))) + .withColumn(s"map$i", map(col("s1"), col("s2"))) + .withColumn(s"array$i", split(col("s1"), " ")) + } + df + } + + private def prepareHoodieTable(tableName: String, path: String, tableType: String, mergerType: String, df: DataFrame): Unit = { + df.collect() + df.createOrReplaceTempView("input_df") + if (spark.catalog.tableExists(tableName)) { + spark.sql(s"drop table if exists $tableName") + } + spark.sql( + s""" + |create table $tableName using hudi + |tblproperties( + | primaryKey = 'id', + | preCombineField = 's1', + | type = '$tableType', + | ${HoodieWriteConfig.MERGER_IMPLS.key} = '$mergerType', + | ${HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT.key} = 'parquet', + | ${HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key()} = '10') + |location '$path' + |As + |select * from input_df + """.stripMargin) + } + + /** + * OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Mac OS X 12.4 + * Apple M1 Pro + * pref insert overwrite: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative + * ----------------------------------------------------------------------------------------------------------------------------------- + * org.apache.hudi.common.model.HoodieAvroRecordMerger 16714 17107 353 0.1 16714.5 1.0X + * org.apache.hudi.HoodieSparkRecordMerger 12654 13924 1100 0.1 12653.8 1.3X + */ + private def overwriteBenchmark(): Unit = { + val df = createComplexDataFrame(1000000, 1) + val benchmark = new HoodieBenchmark("pref insert overwrite", 1000000, 3) 
+ Seq(classOf[HoodieAvroRecordMerger].getName, classOf[HoodieSparkRecordMerger].getName).zip(Seq(avroTable, sparkTable)).foreach { + case (merger, tableName) => benchmark.addCase(merger) { _ => + withTempDir { f => + prepareHoodieTable(tableName, new Path(f.getCanonicalPath, tableName).toUri.toString, "mor", merger, df) + } + } + } + benchmark.run() + } + + private def upsertThenReadBenchmark(): Unit = { + val avroMerger = classOf[HoodieAvroRecordMerger].getName + val sparkMerger = classOf[HoodieSparkRecordMerger].getName + val df = createComplexDataFrame(1000000, 1) + withTempDir { avroPath => + withTempDir { sparkPath => + val upsertBenchmark = new HoodieBenchmark("pref upsert", 1000000, 3) + prepareHoodieTable(avroTable, new Path(avroPath.getCanonicalPath, avroTable).toUri.toString, "mor", avroMerger, df) + prepareHoodieTable(sparkTable, new Path(sparkPath.getCanonicalPath, sparkTable).toUri.toString, "mor", sparkMerger, df) + df.createOrReplaceTempView("input_df") + Seq(avroMerger, sparkMerger).zip(Seq(avroTable, sparkTable)).foreach { + case (merger, tableName) => upsertBenchmark.addCase(merger) { _ => + spark.sql(s"update $tableName set s1 = 's1_new' where id > 0") + } + } + upsertBenchmark.run() + + val readBenchmark = new HoodieBenchmark("pref read", 1000000, 3) + Seq(avroMerger, sparkMerger).zip(Seq(avroTable, sparkTable)).foreach { + case (merger, tableName) => readBenchmark.addCase(merger) { _ => + spark.sql(s"select * from $tableName").collect() + } + } + readBenchmark.run() + + } + } + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + overwriteBenchmark() + upsertThenReadBenchmark() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieOptionConfig.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieOptionConfig.scala index 44c23d146c68b..2cfa8be36f8cb 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieOptionConfig.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieOptionConfig.scala @@ -17,8 +17,9 @@ package org.apache.spark.sql.hudi -import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, OverwriteWithLatestAvroPayload} +import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieAvroRecordMerger, OverwriteWithLatestAvroPayload} import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.common.util.StringUtils import org.apache.hudi.testutils.SparkClientFunctionalTestHarness import org.apache.spark.sql.types._ import org.junit.jupiter.api.Assertions.assertTrue @@ -31,15 +32,18 @@ class TestHoodieOptionConfig extends SparkClientFunctionalTestHarness { def testWithDefaultSqlOptions(): Unit = { val ops1 = Map("primaryKey" -> "id") val with1 = HoodieOptionConfig.withDefaultSqlOptions(ops1) - assertTrue(with1.size == 3) + assertTrue(with1.size == 4) assertTrue(with1("primaryKey") == "id") assertTrue(with1("type") == "cow") assertTrue(with1("payloadClass") == classOf[OverwriteWithLatestAvroPayload].getName) + assertTrue(with1("mergerStrategy") == StringUtils.DEFAULT_MERGER_STRATEGY_UUID) val ops2 = Map("primaryKey" -> "id", "preCombineField" -> "timestamp", "type" -> "mor", - "payloadClass" -> classOf[DefaultHoodieRecordPayload].getName + "payloadClass" -> classOf[DefaultHoodieRecordPayload].getName, + "mergerImpls" -> classOf[HoodieAvroRecordMerger].getName, + "mergerStrategy" -> StringUtils.DEFAULT_MERGER_STRATEGY_UUID ) val with2 = 
HoodieOptionConfig.withDefaultSqlOptions(ops2) assertTrue(ops2 == with2) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSqlConf.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSqlConf.scala index 90d0734945d7d..72bd71e4bba53 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSqlConf.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSqlConf.scala @@ -85,7 +85,8 @@ class TestSqlConf extends HoodieSparkSqlTestBase with BeforeAndAfter { assertResult(HoodieTableType.MERGE_ON_READ)(new HoodieTableConfig( new Path(tablePath).getFileSystem(new Configuration), s"$tablePath/" + HoodieTableMetaClient.METAFOLDER_NAME, - HoodieTableConfig.PAYLOAD_CLASS_NAME.defaultValue).getTableType) + HoodieTableConfig.PAYLOAD_CLASS_NAME.defaultValue, + HoodieTableConfig.MERGER_STRATEGY.defaultValue).getTableType) // Manually pass incremental configs to global configs to make sure Hudi query is able to load the // global configs diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java index cccee5fe1a6ba..4df544694418e 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java @@ -34,6 +34,7 @@ import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.log.HoodieLogFormat; @@ -1047,7 +1048,7 @@ private Option readBloomFilterFromFile(String partitionPath, St Path path = new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), partitionPath), filename); HoodieFileReader fileReader; try { - fileReader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), path); + fileReader = HoodieFileReaderFactory.getReaderFactory(HoodieRecordType.AVRO).getFileReader(metaClient.getHadoopConf(), path); } catch (IOException e) { Log.error("Failed to get file reader for " + path + " " + e.getMessage()); return Option.empty(); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index 1f5756a7ae4c3..3cf0a3354e03c 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -255,7 +255,10 @@ public DeltaSync(HoodieDeltaStreamer.Config cfg, SparkSession sparkSession, Sche */ public void refreshTimeline() throws IOException { if (fs.exists(new Path(cfg.targetBasePath))) { - HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(new Configuration(fs.getConf())).setBasePath(cfg.targetBasePath).setPayloadClassName(cfg.payloadClassName).build(); + HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(new Configuration(fs.getConf())) + .setBasePath(cfg.targetBasePath).setPayloadClassName(cfg.payloadClassName) + .setMergerStrategy(cfg.mergerStrategy) + .build(); switch (meta.getTableType()) { case 
COPY_ON_WRITE: this.commitTimelineOpt = Option.of(meta.getActiveTimeline().getCommitTimeline().filterCompletedInstants()); @@ -277,7 +280,6 @@ public void refreshTimeline() throws IOException { .setTableName(cfg.targetTableName) .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) .setPayloadClassName(cfg.payloadClassName) - .setMergeClassName(cfg.mergeClassName) .setBaseFileFormat(cfg.baseFileFormat) .setPartitionFields(partitionColumns) .setRecordKeyFields(props.getProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key())) @@ -376,7 +378,6 @@ public Pair>> readFromSource( .setTableName(cfg.targetTableName) .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) .setPayloadClassName(cfg.payloadClassName) - .setMergeClassName(cfg.mergeClassName) .setBaseFileFormat(cfg.baseFileFormat) .setPartitionFields(partitionColumns) .setRecordKeyFields(props.getProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key())) @@ -845,7 +846,11 @@ private Schema getSchemaForWriteConfig(Schema targetSchema) { if (SchemaCompatibility.checkReaderWriterCompatibility(targetSchema, InputBatch.NULL_SCHEMA).getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE && SchemaCompatibility.checkReaderWriterCompatibility(InputBatch.NULL_SCHEMA, targetSchema).getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE) { // target schema is null. fetch schema from commit metadata and use it - HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(new Configuration(fs.getConf())).setBasePath(cfg.targetBasePath).setPayloadClassName(cfg.payloadClassName).build(); + HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(new Configuration(fs.getConf())) + .setBasePath(cfg.targetBasePath) + .setPayloadClassName(cfg.payloadClassName) + .setMergerStrategy(cfg.mergerStrategy) + .build(); int totalCompleted = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants(); if (totalCompleted > 0) { try { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java index 3dcff565251ad..108be047e760e 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -32,7 +32,7 @@ import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieAvroRecordMerge; +import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.WriteOperationType; @@ -273,9 +273,13 @@ public static class Config implements Serializable { + "a GenericRecord. Implement your own, if you want to do something other than overwriting existing value") public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName(); - @Parameter(names = {"--merge-class"}, description = "Implements of HoodieMerge, that defines how to merge two records." 
- + "Implement your own, if you want to implement specific record merge logic.") - public String mergeClassName = HoodieAvroRecordMerge.class.getName(); + @Parameter(names = {"--merger-impls"}, description = "List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. " + + "These merger impls will filter by merger-strategy " + + "Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)") + public String mergerImpls = HoodieAvroRecordMerger.class.getName(); + + @Parameter(names = {"--merger-strategy"}, description = "Id of merger strategy. Hudi will pick RecordMergers in merger-impls which has the same merger strategy id") + public String mergerStrategy = StringUtils.DEFAULT_MERGER_STRATEGY_UUID; @Parameter(names = {"--schemaprovider-class"}, description = "subclass of org.apache.hudi.utilities.schema" + ".SchemaProvider to attach schemas to input & target table data, built in options: "