From aa6e1c213dc9f8c13215265cc27f3b35a22a204d Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Wed, 27 Dec 2023 20:41:03 -0500 Subject: [PATCH 01/41] squash commits --- ...hadoop284_hive233_spark244_mac_aarch64.yml | 3 +- .../BaseSparkInternalRowReaderContext.java | 45 +--- .../common/engine/HoodieReaderContext.java | 29 ++- .../hudi/common/model/HoodieRecord.java | 2 +- .../table/read/HoodieFileGroupReader.java | 6 + .../hudi/hadoop/HiveHoodieReaderContext.java | 221 +++++++++++++++++ .../HoodieFileGroupReaderRecordReader.java | 231 ++++++++++++++++++ .../apache/hudi/hadoop/HoodieHiveRecord.java | 231 ++++++++++++++++++ .../hudi/hadoop/HoodieHiveRecordMerger.java | 71 ++++++ .../hudi/hadoop/HoodieParquetInputFormat.java | 24 +- .../hadoop/RecordReaderValueIterator.java | 13 +- .../AbstractRealtimeRecordReader.java | 1 + .../HoodieCombineRealtimeRecordReader.java | 12 +- .../HoodieParquetRealtimeInputFormat.java | 17 +- .../utils/HoodieArrayWritableAvroUtils.java | 102 ++++++++ .../hadoop/utils/ObjectInspectorCache.java | 96 ++++++++ 16 files changed, 1029 insertions(+), 75 deletions(-) create mode 100644 hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java create mode 100644 hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java create mode 100644 hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java create mode 100644 hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecordMerger.java create mode 100644 hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java create mode 100644 hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml index 0abcf676d5f75..edc1a1249aa02 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml @@ -129,10 +129,11 @@ services: - ./hadoop.env environment: SERVICE_PRECONDITION: "hivemetastore:9083" + JAVA_TOOL_OPTIONS: "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005" ports: - "10000:10000" # JVM debugging port (will be mapped to a random port on host) - - "5005" + - "5005:64757" depends_on: - "hivemetastore" links: diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/BaseSparkInternalRowReaderContext.java b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/BaseSparkInternalRowReaderContext.java index af5e1c75b2525..644c950e146da 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/BaseSparkInternalRowReaderContext.java +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/BaseSparkInternalRowReaderContext.java @@ -69,30 +69,15 @@ public HoodieRecordMerger getRecordMerger(String mergerStrategy) { @Override public Object getValue(InternalRow row, Schema schema, String fieldName) { - return getFieldValueFromInternalRow(row, schema, fieldName); - } - - @Override - public String getRecordKey(InternalRow row, Schema schema) { - return getFieldValueFromInternalRow(row, schema, RECORD_KEY_METADATA_FIELD).toString(); - } - - @Override - public Comparable getOrderingValue(Option rowOption, - Map metadataMap, - Schema schema, - TypedProperties props) { - if (metadataMap.containsKey(INTERNAL_META_ORDERING_FIELD)) { - return (Comparable) metadataMap.get(INTERNAL_META_ORDERING_FIELD); - } - - if (!rowOption.isPresent()) { - return 0; + StructType structType = getCachedSchema(schema); + scala.Option cachedNestedFieldPath = + HoodieInternalRowUtils.getCachedPosList(structType, fieldName); + if (cachedNestedFieldPath.isDefined()) { + HoodieUnsafeRowUtils.NestedFieldPath nestedFieldPath = cachedNestedFieldPath.get(); + return HoodieUnsafeRowUtils.getNestedInternalRowValue(row, nestedFieldPath); + } else { + return null; } - - String orderingFieldName = ConfigUtils.getOrderingField(props); - Object value = getFieldValueFromInternalRow(rowOption.get(), schema, orderingFieldName); - return value != null ? (Comparable) value : 0; } @Override @@ -117,25 +102,13 @@ public InternalRow seal(InternalRow internalRow) { @Override public long extractRecordPosition(InternalRow record, Schema recordSchema, String fieldName, long providedPositionIfNeeded) { - Object position = getFieldValueFromInternalRow(record, recordSchema, fieldName); + Object position = getValue(record, recordSchema, fieldName); if (position != null) { return (long) position; } return providedPositionIfNeeded; } - private Object getFieldValueFromInternalRow(InternalRow row, Schema recordSchema, String fieldName) { - StructType structType = getCachedSchema(recordSchema); - scala.Option cachedNestedFieldPath = - HoodieInternalRowUtils.getCachedPosList(structType, fieldName); - if (cachedNestedFieldPath.isDefined()) { - HoodieUnsafeRowUtils.NestedFieldPath nestedFieldPath = cachedNestedFieldPath.get(); - return HoodieUnsafeRowUtils.getNestedInternalRowValue(row, nestedFieldPath); - } else { - return null; - } - } - @Override public UnaryOperator projectRecord(Schema from, Schema to) { UnsafeProjection projection = HoodieInternalRowUtils.generateUnsafeProjectionAlias(getCachedSchema(from), getCachedSchema(to)); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java index 78f58db7eb5f4..c61d63e10f1b8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordMerger; +import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.storage.HoodieStorage; @@ -36,6 +37,8 @@ import java.util.Map; import java.util.function.UnaryOperator; +import static org.apache.hudi.common.model.HoodieRecord.RECORD_KEY_METADATA_FIELD; + /** * An abstract reader context class for {@code HoodieFileGroupReader} to use, containing APIs for * engine-specific implementation on reading data files, getting field values from a record, @@ -113,7 +116,10 @@ public abstract ClosableIterator getFileRecordIterator( * @param schema The Avro schema of the record. * @return The record key in String. */ - public abstract String getRecordKey(T record, Schema schema); + public String getRecordKey(T record, Schema schema) { + Object val = getValue(record, schema, RECORD_KEY_METADATA_FIELD); + return val.toString(); + } /** * Gets the ordering value in particular type. @@ -124,10 +130,23 @@ public abstract ClosableIterator getFileRecordIterator( * @param props Properties. * @return The ordering value. */ - public abstract Comparable getOrderingValue(Option recordOption, + public Comparable getOrderingValue(Option recordOption, Map metadataMap, Schema schema, - TypedProperties props); + TypedProperties props) { + if (metadataMap.containsKey(INTERNAL_META_ORDERING_FIELD)) { + return (Comparable) metadataMap.get(INTERNAL_META_ORDERING_FIELD); + } + + if (!recordOption.isPresent()) { + return 0; + } + + String orderingFieldName = ConfigUtils.getOrderingField(props); + Object value = getValue(recordOption.get(), schema, orderingFieldName); + return value != null ? (Comparable) value : 0; + + } /** * Constructs a new {@link HoodieRecord} based on the record of engine-specific type and metadata for merging. @@ -218,6 +237,10 @@ public Map updateSchemaAndResetOrderingValInMetadata(Map implements Closeable { + private boolean schemaInOrder = true; private final HoodieReaderContext readerContext; private final Option hoodieBaseFileOption; private final List logFiles; @@ -199,6 +200,10 @@ private Schema generateRequiredSchema() { } private Schema maybeReorderForBootstrap(Schema input) { + if (schemaInOrder) { + return createSchemaFromFields(input.getFields().stream() + .sorted((o1, o2) -> Integer.compare(dataSchema.getField(o1.name()).pos(),dataSchema.getField(o2.name()).pos())).collect(Collectors.toList())); + } if (this.hoodieBaseFileOption.isPresent() && this.hoodieBaseFileOption.get().getBootstrapBaseFile().isPresent()) { Pair, List> requiredFields = getDataAndMetaCols(input); if (!(requiredFields.getLeft().isEmpty() || requiredFields.getRight().isEmpty())) { @@ -284,6 +289,7 @@ public T next() { } private void scanLogFiles() { + System.out.println("Scanning log files"); String path = readerState.tablePath; HoodieMergedLogRecordReader logRecordReader = HoodieMergedLogRecordReader.newBuilder() .withHoodieReaderContext(readerContext) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java new file mode 100644 index 0000000000000..11873aded1cf4 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hudi.common.engine.HoodieReaderContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieEmptyRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordMerger; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ClosableIterator; +import org.apache.hudi.common.util.collection.CloseableMappingIterator; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hadoop.utils.HoodieArrayWritableAvroUtils; +import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; +import org.apache.hudi.hadoop.utils.ObjectInspectorCache; + +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.function.UnaryOperator; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.model.HoodieRecordMerger.DEFAULT_MERGER_STRATEGY_UUID; + +public class HiveHoodieReaderContext extends HoodieReaderContext { + protected final HoodieFileGroupReaderRecordReader.HiveReaderCreator readerCreator; + protected final InputSplit split; + protected final JobConf jobConf; + protected final Reporter reporter; + protected final Schema writerSchema; + protected Map hosts; + protected final Map columnTypeMap; + + private final ObjectInspectorCache objectInspectorCache; + + private final String tableName; + + private RecordReader firstRecordReader = null; + + protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCreator readerCreator, + InputSplit split, + JobConf jobConf, + Reporter reporter, + Schema writerSchema, + Map hosts, + HoodieTableMetaClient metaClient) { + this.readerCreator = readerCreator; + this.split = split; + this.jobConf = jobConf; + this.reporter = reporter; + this.writerSchema = writerSchema; + this.hosts = hosts; + this.tableName = metaClient.getTableConfig().getTableName(); + this.objectInspectorCache = HoodieArrayWritableAvroUtils.getCacheForTable(tableName, writerSchema, jobConf); + this.columnTypeMap = objectInspectorCache.getColumnTypeMap(); + } + + @Override + public FileSystem getFs(String path, Configuration conf) { + return FSUtils.getFs(path, conf); + } + + @Override + public ClosableIterator getFileRecordIterator(Path filePath, long start, long length, Schema dataSchema, Schema requiredSchema, Configuration conf) throws IOException { + JobConf jobConfCopy = new JobConf(jobConf); + setSchemas(jobConfCopy, dataSchema, requiredSchema); + InputSplit inputSplit = new FileSplit(filePath, start, length, hosts.get(filePath.toString())); + RecordReader recordReader = readerCreator.getRecordReader(inputSplit, jobConfCopy, reporter); + if (firstRecordReader == null) { + firstRecordReader = recordReader; + } + ClosableIterator recordIterator = new RecordReaderValueIterator<>(recordReader); + if (dataSchema.equals(requiredSchema)) { + return recordIterator; + } + return new CloseableMappingIterator<>(recordIterator, projectRecord(dataSchema, requiredSchema)); + } + + private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchema) { + List dataColumnNameList = dataSchema.getFields().stream().map(Schema.Field::name).collect(Collectors.toList()); + List dataColumnTypeList = dataColumnNameList.stream().map(columnTypeMap::get).collect(Collectors.toList()); + jobConf.set(serdeConstants.LIST_COLUMNS, String.join(",", dataColumnNameList)); + jobConf.set(serdeConstants.LIST_COLUMN_TYPES, dataColumnTypeList.stream().map(TypeInfo::getQualifiedName).collect(Collectors.joining(","))); + String readColNames = requiredSchema.getFields().stream().map(f -> f.name()).collect(Collectors.joining(",")); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNames); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, requiredSchema.getFields() + .stream().map(f -> String.valueOf(dataSchema.getField(f.name()).pos())).collect(Collectors.joining(","))); + } + + @Override + public ArrayWritable convertAvroRecord(IndexedRecord avroRecord) { + //should be support timestamp? + ArrayWritable convertedRecord = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(avroRecord, avroRecord.getSchema(), false); + return convertedRecord; + } + + @Override + public HoodieRecordMerger getRecordMerger(String mergerStrategy) { + switch (mergerStrategy) { + case DEFAULT_MERGER_STRATEGY_UUID: + return new HoodieHiveRecordMerger(); + default: + throw new HoodieException("The merger strategy UUID is not supported: " + mergerStrategy); + } + } + + @Override + public Object getValue(ArrayWritable record, Schema schema, String fieldName) { + return objectInspectorCache.getValue(record, schema, fieldName); + } + + @Override + public HoodieRecord constructHoodieRecord(Option recordOption, Map metadataMap) { + if (!recordOption.isPresent()) { + return new HoodieEmptyRecord<>(new HoodieKey((String) metadataMap.get(INTERNAL_META_RECORD_KEY), (String) metadataMap.get(INTERNAL_META_PARTITION_PATH)), HoodieRecord.HoodieRecordType.AVRO); + } + Schema schema = (Schema) metadataMap.get(INTERNAL_META_SCHEMA); + ArrayWritable writable = recordOption.get(); + return new HoodieHiveRecord(new HoodieKey((String) metadataMap.get(INTERNAL_META_RECORD_KEY), (String) metadataMap.get(INTERNAL_META_PARTITION_PATH)), writable, schema, objectInspectorCache); + } + + @Override + public ArrayWritable seal(ArrayWritable record) { + return new ArrayWritable(Writable.class, Arrays.copyOf(record.get(), record.get().length)); + } + + @Override + public ClosableIterator mergeBootstrapReaders(ClosableIterator skeletonFileIterator, + ClosableIterator dataFileIterator) { + return new ClosableIterator() { + + private final ArrayWritable returnWritable = new ArrayWritable(Writable.class); + @Override + public boolean hasNext() { + if (dataFileIterator.hasNext() != skeletonFileIterator.hasNext()) { + throw new IllegalStateException("bootstrap data file iterator and skeleton file iterator are out of sync"); + } + return dataFileIterator.hasNext(); + } + + @Override + public ArrayWritable next() { + Writable[] skeletonWritable = skeletonFileIterator.next().get(); + Writable[] dataWritable = dataFileIterator.next().get(); + Writable[] mergedWritable = new Writable[skeletonWritable.length + dataWritable.length]; + System.arraycopy(skeletonWritable, 0, mergedWritable, 0, skeletonWritable.length); + System.arraycopy(dataWritable, 0, mergedWritable, skeletonWritable.length, dataWritable.length); + returnWritable.set(mergedWritable); + return returnWritable; + } + + @Override + public void close() { + skeletonFileIterator.close(); + dataFileIterator.close(); + } + }; + } + + @Override + public UnaryOperator projectRecord(Schema from, Schema to) { + return HoodieArrayWritableAvroUtils.projectRecord(from, to); + } + + public UnaryOperator reverseProjectRecord(Schema from, Schema to) { + return HoodieArrayWritableAvroUtils.reverseProject(from, to); + } + + public long getPos() throws IOException { + if (firstRecordReader != null) { + return firstRecordReader.getPos(); + } + return 0; + } + + public float getProgress() throws IOException { + if (firstRecordReader != null) { + return firstRecordReader.getProgress(); + } + return 0; + } + +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java new file mode 100644 index 0000000000000..860ab85c5b0e2 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.read.HoodieFileGroupReader; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.TablePathUtils; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit; +import org.apache.hudi.hadoop.realtime.RealtimeSplit; +import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; +import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.UnaryOperator; +import java.util.stream.Collectors; + +public class HoodieFileGroupReaderRecordReader implements RecordReader { + + public interface HiveReaderCreator { + org.apache.hadoop.mapred.RecordReader getRecordReader( + final org.apache.hadoop.mapred.InputSplit split, + final org.apache.hadoop.mapred.JobConf job, + final org.apache.hadoop.mapred.Reporter reporter + ) throws IOException; + } + + private final HiveHoodieReaderContext readerContext; + + private final HoodieFileGroupReader fileGroupReader; + private final ArrayWritable arrayWritable; + private final NullWritable nullWritable = NullWritable.get(); + + private final InputSplit inputSplit; + private final JobConf jobConf; + + private final UnaryOperator reverseProjection; + + public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, + final InputSplit split, + final JobConf jobConf, + final Reporter reporter) throws IOException { + HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); + this.inputSplit = split; + this.jobConf = jobConf; + FileSplit fileSplit = (FileSplit) split; + String tableBasePath = getTableBasePath(split, jobConf); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(jobConf) + .setBasePath(tableBasePath) + .build(); + Schema tableSchema = getLatestTableSchema(metaClient, jobConf); + Schema requestedSchema = createRequestedSchema(tableSchema, jobConf); + Map hosts = new HashMap<>(); + this.readerContext = new HiveHoodieReaderContext(readerCreator, split, jobConf, reporter, tableSchema, hosts, metaClient); + this.arrayWritable = new ArrayWritable(Writable.class, new Writable[requestedSchema.getFields().size()]); + this.fileGroupReader = new HoodieFileGroupReader<>(readerContext, jobConf, tableBasePath, + getLatestCommitTime(split, metaClient), getFileSliceFromSplit(fileSplit, hosts, readerContext.getFs(tableBasePath, jobConf), tableBasePath), + tableSchema, requestedSchema, metaClient.getTableConfig().getProps(), metaClient.getTableConfig(), fileSplit.getStart(), + fileSplit.getLength(), false); + this.fileGroupReader.initRecordIterators(); + this.reverseProjection = readerContext.reverseProjectRecord(requestedSchema, tableSchema); + } + + @Override + public boolean next(NullWritable key, ArrayWritable value) throws IOException { + if (!fileGroupReader.hasNext()) { + return false; + } + value.set(fileGroupReader.next().get()); + reverseProjection.apply(value); + return true; + } + + @Override + public NullWritable createKey() { + return nullWritable; + } + + @Override + public ArrayWritable createValue() { + return arrayWritable; + } + + @Override + public long getPos() throws IOException { + return readerContext.getPos(); + } + + @Override + public void close() throws IOException { + fileGroupReader.close(); + } + + @Override + public float getProgress() throws IOException { + return readerContext.getProgress(); + } + + public RealtimeSplit getSplit() { + return (RealtimeSplit) inputSplit; + } + + public JobConf getJobConf() { + return jobConf; + } + + private static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, JobConf jobConf) { + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + try { + Option schemaOpt = tableSchemaResolver.getTableAvroSchemaFromLatestCommit(true); + if (schemaOpt.isPresent()) { + // Add partitioning fields to writer schema for resulting row to contain null values for these fields + String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); + List partitioningFields = + partitionFields.length() > 0 ? Arrays.stream(partitionFields.split("/")).collect(Collectors.toList()) + : new ArrayList<>(); + return HoodieRealtimeRecordReaderUtils.addPartitionFields(schemaOpt.get(), partitioningFields); + } + throw new RuntimeException("Unable to get table schema"); + } catch (Exception e) { + throw new RuntimeException("Unable to get table schema", e); + } + } + + private static String getTableBasePath(InputSplit split, JobConf jobConf) throws IOException { + if (split instanceof RealtimeSplit) { + RealtimeSplit realtimeSplit = (RealtimeSplit) split; + return realtimeSplit.getBasePath(); + } else { + Path inputPath = ((FileSplit)split).getPath(); + FileSystem fs = inputPath.getFileSystem(jobConf); + Option tablePath = TablePathUtils.getTablePath(fs, inputPath); + return tablePath.get().toString(); + } + } + + private static String getLatestCommitTime(InputSplit split, HoodieTableMetaClient metaClient) { + if (split instanceof RealtimeSplit) { + return ((RealtimeSplit) split).getMaxCommitTime(); + } + Option lastInstant = metaClient.getCommitsTimeline().lastInstant(); + if (lastInstant.isPresent()) { + return lastInstant.get().getTimestamp(); + } else { + return ""; + } + } + + private static FileSlice getFileSliceFromSplit(FileSplit split, Map hosts, FileSystem fs, String tableBasePath) throws IOException { + if (split instanceof RealtimeSplit) { + RealtimeSplit realtimeSplit = (RealtimeSplit) split; + HoodieFileGroupId fileGroupId = new HoodieFileGroupId(FSUtils.getFileId(realtimeSplit.getPath().getName()), + FSUtils.getPartitionPath(realtimeSplit.getBasePath(), realtimeSplit.getPath().getParent().toString()).toString()); + String commitTime = FSUtils.getCommitTime(realtimeSplit.getPath().toString()); + BaseFile bootstrapBaseFile = null; + if (realtimeSplit instanceof HoodieRealtimeBootstrapBaseFileSplit) { + HoodieRealtimeBootstrapBaseFileSplit hoodieRealtimeBootstrapBaseFileSplit = (HoodieRealtimeBootstrapBaseFileSplit) realtimeSplit; + FileSplit bootstrapBaseFileSplit = hoodieRealtimeBootstrapBaseFileSplit.getBootstrapFileSplit(); + hosts.put(bootstrapBaseFileSplit.getPath().toString(), bootstrapBaseFileSplit.getLocations()); + bootstrapBaseFile = new BaseFile(fs.getFileStatus(bootstrapBaseFileSplit.getPath())); + } + hosts.put(realtimeSplit.getPath().toString(), realtimeSplit.getLocations()); + HoodieBaseFile hoodieBaseFile = new HoodieBaseFile(fs.getFileStatus(realtimeSplit.getPath()), bootstrapBaseFile); + return new FileSlice(fileGroupId, commitTime, hoodieBaseFile, realtimeSplit.getDeltaLogFiles()); + } + //just regular cow + HoodieFileGroupId fileGroupId = new HoodieFileGroupId(FSUtils.getFileId(split.getPath().getName()), + FSUtils.getPartitionPath(tableBasePath, split.getPath().getParent().toString()).toString()); + hosts.put(split.getPath().toString(), split.getLocations()); + return new FileSlice(fileGroupId, FSUtils.getCommitTime(split.getPath().toString()), new HoodieBaseFile(fs.getFileStatus(split.getPath())), Collections.emptyList()); + } + + private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) { + String partitionColString = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS); + Set partitionColumns; + if (partitionColString == null) { + partitionColumns = Collections.EMPTY_SET; + } else { + partitionColumns = Arrays.stream(partitionColString.split(",")).collect(Collectors.toSet()); + } + return HoodieAvroUtils.generateProjectionSchema(tableSchema, + Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c -> !partitionColumns.contains(c)).collect(Collectors.toList())); + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java new file mode 100644 index 0000000000000..702f4880d79a5 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.MetadataValues; +import org.apache.hudi.common.util.ConfigUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.hadoop.utils.HoodieArrayWritableAvroUtils; +import org.apache.hudi.hadoop.utils.ObjectInspectorCache; +import org.apache.hudi.keygen.BaseKeyGenerator; + +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.io.Input; +import com.esotericsoftware.kryo.io.Output; +import org.apache.avro.Schema; +import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; +import java.util.Properties; + +public class HoodieHiveRecord extends HoodieRecord { + + private boolean copy; + private boolean isDeleted; + + public boolean isDeleted() { + return isDeleted; + } + + private final ArrayWritableObjectInspector objectInspector; + + private final ObjectInspectorCache objectInspectorCache; + + protected Schema schema; + public HoodieHiveRecord(HoodieKey key, ArrayWritable data, Schema schema, ObjectInspectorCache objectInspectorCache) { + super(key, data); + this.objectInspector = objectInspectorCache.getObjectInspector(schema); + this.objectInspectorCache = objectInspectorCache; + this.schema = schema; + this.copy = false; + isDeleted = data == null; + } + + private HoodieHiveRecord(HoodieKey key, ArrayWritable data, Schema schema, HoodieOperation operation, boolean isCopy, + ArrayWritableObjectInspector objectInspector, ObjectInspectorCache objectInspectorCache) { + super(key, data, operation, Option.empty()); + this.schema = schema; + this.copy = isCopy; + isDeleted = data == null; + this.objectInspector = objectInspector; + this.objectInspectorCache = objectInspectorCache; + } + + @Override + public HoodieRecord newInstance() { + return new HoodieHiveRecord(this.key, this.data, this.schema, this.operation, this.copy, this.objectInspector, this.objectInspectorCache); + } + + @Override + public HoodieRecord newInstance(HoodieKey key, HoodieOperation op) { + throw new UnsupportedOperationException("ObjectInspector is needed for HoodieHiveRecord"); + } + + @Override + public HoodieRecord newInstance(HoodieKey key) { + throw new UnsupportedOperationException("ObjectInspector is needed for HoodieHiveRecord"); + } + + @Override + public Comparable getOrderingValue(Schema recordSchema, Properties props) { + return (Comparable) getValue(ConfigUtils.getOrderingField(props)); + } + + @Override + public HoodieRecordType getRecordType() { + return HoodieRecordType.HIVE; + } + + @Override + public String getRecordKey(Schema recordSchema, Option keyGeneratorOpt) { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + public String getRecordKey(Schema recordSchema, String keyFieldName) { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + protected void writeRecordPayload(ArrayWritable payload, Kryo kryo, Output output) { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + protected ArrayWritable readRecordPayload(Kryo kryo, Input input) { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + public Object[] getColumnValues(Schema recordSchema, String[] columns, boolean consistentLogicalTimestampEnabled) { + Object[] objects = new Object[columns.length]; + for (int i = 0; i < objects.length; i++) { + objects[i] = getValue(columns[i]); + } + return objects; + } + + @Override + public HoodieRecord joinWith(HoodieRecord other, Schema targetSchema) { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + public HoodieRecord prependMetaFields(Schema recordSchema, Schema targetSchema, MetadataValues metadataValues, Properties props) { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + public HoodieRecord rewriteRecordWithNewSchema(Schema recordSchema, Properties props, Schema newSchema, Map renameCols) { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + public boolean isDelete(Schema recordSchema, Properties props) throws IOException { + if (null == data) { + return true; + } + if (recordSchema.getField(HoodieRecord.HOODIE_IS_DELETED_FIELD) == null) { + return false; + } + Object deleteMarker = getValue(HoodieRecord.HOODIE_IS_DELETED_FIELD); + return deleteMarker instanceof BooleanWritable && ((BooleanWritable) deleteMarker).get(); + } + + @Override + public boolean shouldIgnore(Schema recordSchema, Properties props) throws IOException { + return false; + } + + @Override + public HoodieRecord copy() { + if (!copy) { + this.data = new ArrayWritable(Writable.class, Arrays.copyOf(this.data.get(), this.data.get().length)); + this.copy = true; + } + return this; + } + + @Override + public Option> getMetadata() { + // TODO HUDI-5282 support metaData + return Option.empty(); + } + + @Override + public HoodieRecord wrapIntoHoodieRecordPayloadWithParams(Schema recordSchema, Properties props, Option> simpleKeyGenFieldsOpt, Boolean withOperation, + Option partitionNameOp, Boolean populateMetaFieldsOp, Option schemaWithoutMetaFields) throws IOException { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + public HoodieRecord wrapIntoHoodieRecordPayloadWithKeyGen(Schema recordSchema, Properties props, Option keyGen) { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + @Override + public HoodieRecord truncateRecordKey(Schema recordSchema, Properties props, String keyFieldName) throws IOException { + data.get()[recordSchema.getIndexNamed(keyFieldName)] = new Text(); + return this; + } + + @Override + public Option toIndexedRecord(Schema recordSchema, Properties props) throws IOException { + throw new UnsupportedOperationException("Not supported for HoodieHiveRecord"); + } + + private Object getValue(String name) { + return HoodieArrayWritableAvroUtils.getWritableValue(data, objectInspector, name); + } + + protected Schema getSchema() { + return schema; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecordMerger.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecordMerger.java new file mode 100644 index 0000000000000..17a4738569e53 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecordMerger.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordMerger; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.avro.Schema; + +import java.io.IOException; + +public class HoodieHiveRecordMerger implements HoodieRecordMerger { + @Override + public Option> merge(HoodieRecord older, Schema oldSchema, HoodieRecord newer, Schema newSchema, TypedProperties props) throws IOException { + ValidationUtils.checkArgument(older.getRecordType() == HoodieRecord.HoodieRecordType.HIVE); + ValidationUtils.checkArgument(newer.getRecordType() == HoodieRecord.HoodieRecordType.HIVE); + if (newer instanceof HoodieHiveRecord) { + HoodieHiveRecord newHiveRecord = (HoodieHiveRecord) newer; + if (newHiveRecord.isDeleted()) { + return Option.empty(); + } + } else if (newer.getData() == null) { + return Option.empty(); + } + + if (older instanceof HoodieHiveRecord) { + HoodieHiveRecord oldHiveRecord = (HoodieHiveRecord) older; + if (oldHiveRecord.isDeleted()) { + return Option.of(Pair.of(newer, newSchema)); + } + } else if (older.getData() == null) { + return Option.empty(); + } + if (older.getOrderingValue(oldSchema, props).compareTo(newer.getOrderingValue(newSchema, props)) > 0) { + return Option.of(Pair.of(older, oldSchema)); + } else { + return Option.of(Pair.of(newer, newSchema)); + } + } + + @Override + public HoodieRecord.HoodieRecordType getRecordType() { + return HoodieRecord.HoodieRecordType.HIVE; + } + + @Override + public String getMergingStrategy() { + return HoodieRecordMerger.DEFAULT_MERGER_STRATEGY_UUID; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 9e6565299040b..a61ac3709f4b9 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -20,11 +20,10 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hadoop.avro.HoodieTimestampAwareParquetInputFormat; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; -import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.plan.TableScanDesc; @@ -35,9 +34,7 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; -import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; import org.apache.parquet.hadoop.ParquetInputFormat; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -106,9 +103,6 @@ public RecordReader getRecordReader(final InputSpli // ParquetInputFormat.setFilterPredicate(job, predicate); // clearOutExistingPredicate(job); // } - if (split instanceof BootstrapBaseFileSplit) { - return createBootstrappingRecordReader(split, job, reporter); - } // adapt schema evolution new SchemaEvolutionContext(split, job).doEvolutionForParquetFormat(); @@ -117,20 +111,26 @@ public RecordReader getRecordReader(final InputSpli LOG.debug("EMPLOYING DEFAULT RECORD READER - " + split); } - HoodieRealtimeInputFormatUtils.addProjectionField(job, job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").split("/")); + //HoodieRealtimeInputFormatUtils.addProjectionField(job, job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").split("/")); return getRecordReaderInternal(split, job, reporter); } private RecordReader getRecordReaderInternal(InputSplit split, JobConf job, - Reporter reporter) throws IOException { + Reporter reporter) { try { if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { - return new ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(), split, job, reporter); + return new HoodieFileGroupReaderRecordReader((s, j, r) -> { + try { + return new ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(), s, j, r); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + }, split, job, reporter); } else { - return super.getRecordReader(split, job, reporter); + return new HoodieFileGroupReaderRecordReader(super::getRecordReader, split, job, reporter); } - } catch (final InterruptedException | IOException e) { + } catch (final IOException e) { throw new RuntimeException("Cannot create a RecordReaderWrapper", e); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RecordReaderValueIterator.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RecordReaderValueIterator.java index 7ffa3bf555c03..c08c358c0c87d 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RecordReaderValueIterator.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RecordReaderValueIterator.java @@ -18,6 +18,7 @@ package org.apache.hudi.hadoop; +import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.exception.HoodieException; import org.apache.hadoop.mapred.RecordReader; @@ -25,7 +26,6 @@ import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.Iterator; import java.util.NoSuchElementException; /** @@ -34,7 +34,7 @@ * @param Key Type * @param Value Type */ -public class RecordReaderValueIterator implements Iterator { +public class RecordReaderValueIterator implements ClosableIterator { private static final Logger LOG = LoggerFactory.getLogger(RecordReaderValueIterator.class); @@ -79,7 +79,12 @@ public V next() { return retVal; } - public void close() throws IOException { - this.reader.close(); + @Override + public void close() { + try { + this.reader.close(); + } catch (IOException e) { + throw new RuntimeException("Could not close reader", e); + } } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java index 058ca11a9a07d..8ab08e0772b5c 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java @@ -103,6 +103,7 @@ public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) { throw new HoodieException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); } prepareHiveAvroSerializer(); + throw new HoodieException("don't want to use this reader"); } private boolean usesCustomPayload(HoodieTableMetaClient metaClient) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java index 1edf29d45d57e..338bbf00c8744 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java @@ -19,6 +19,7 @@ package org.apache.hudi.hadoop.realtime; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.hadoop.HoodieFileGroupReaderRecordReader; import org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit; import org.apache.hadoop.hive.ql.io.IOContextMap; @@ -42,9 +43,9 @@ public class HoodieCombineRealtimeRecordReader implements RecordReader recordReaders = new LinkedList<>(); + List recordReaders = new LinkedList<>(); // Points to the currently iterating record reader - HoodieRealtimeRecordReader currentRecordReader; + HoodieFileGroupReaderRecordReader currentRecordReader; public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split, List readers) { @@ -53,8 +54,9 @@ public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split .size(), "Num Splits does not match number of unique RecordReaders!"); for (InputSplit rtSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) { LOG.info("Creating new RealtimeRecordReader for split"); - recordReaders.add( - new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) rtSplit, jobConf, readers.remove(0))); + RecordReader reader = readers.remove(0); + ValidationUtils.checkArgument(reader instanceof HoodieFileGroupReaderRecordReader, reader.toString() + "not instance of HoodieFileGroupReaderRecordReader "); + recordReaders.add((HoodieFileGroupReaderRecordReader) reader); } currentRecordReader = recordReaders.remove(0); } catch (Exception e) { @@ -69,7 +71,7 @@ public boolean next(NullWritable key, ArrayWritable value) throws IOException { } else if (recordReaders.size() > 0) { this.currentRecordReader.close(); this.currentRecordReader = recordReaders.remove(0); - AbstractRealtimeRecordReader reader = (AbstractRealtimeRecordReader)currentRecordReader.getReader(); + HoodieFileGroupReaderRecordReader reader = currentRecordReader; // when switch reader, ioctx should be updated IOContextMap.get(reader.getJobConf()).setInputPath(reader.getSplit().getPath()); return next(key, value); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index 7e74171c3f985..7da6964997198 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -19,14 +19,12 @@ package org.apache.hudi.hadoop.realtime; import org.apache.hudi.common.table.HoodieTableConfig; -import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; import org.apache.hudi.hadoop.UseRecordReaderFromInputFormat; -import org.apache.hudi.hadoop.fs.HadoopFSUtils; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; @@ -70,20 +68,13 @@ public RecordReader getRecordReader(final InputSpli "HoodieRealtimeRecordReader can only work on RealtimeSplit and not with " + split); RealtimeSplit realtimeSplit = (RealtimeSplit) split; // add preCombineKey - HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() - .setConf(HadoopFSUtils.getStorageConfWithCopy(jobConf)).setBasePath(realtimeSplit.getBasePath()).build(); - HoodieTableConfig tableConfig = metaClient.getTableConfig(); - addProjectionToJobConf(realtimeSplit, jobConf, tableConfig); + //HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(realtimeSplit.getBasePath()).build(); + //HoodieTableConfig tableConfig = metaClient.getTableConfig(); + //addProjectionToJobConf(realtimeSplit, jobConf, tableConfig); LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); - // for log only split, set the parquet reader as empty. - if (HadoopFSUtils.isLogFile(realtimeSplit.getPath())) { - return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, new HoodieEmptyRecordReader(realtimeSplit, jobConf)); - } - - return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, - super.getRecordReader(split, jobConf, reporter)); + return super.getRecordReader(realtimeSplit, jobConf, reporter); } void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf jobConf, HoodieTableConfig tableConfig) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java new file mode 100644 index 0000000000000..50832b1908528 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop.utils; + +import org.apache.hudi.common.util.collection.Pair; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import org.apache.avro.Schema; +import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; + +import java.util.List; +import java.util.function.UnaryOperator; + +public class HoodieArrayWritableAvroUtils { + + private static final Cache + OBJECT_INSPECTOR_TABLE_CACHE = Caffeine.newBuilder().maximumSize(1000).build(); + + public static ObjectInspectorCache getCacheForTable(String table, Schema tableSchema, JobConf jobConf) { + ObjectInspectorCache cache = OBJECT_INSPECTOR_TABLE_CACHE.getIfPresent(table); + if (cache == null) { + cache = new ObjectInspectorCache(tableSchema, jobConf); + } + return cache; + } + + private static final Cache, int[]> + PROJECTION_CACHE = Caffeine.newBuilder().maximumSize(1000).build(); + + public static int[] getProjection(Schema from, Schema to) { + return PROJECTION_CACHE.get(Pair.of(from, to), schemas -> { + List toFields = to.getFields(); + int[] newProjection = new int[toFields.size()]; + for (int i = 0; i < newProjection.length; i++) { + newProjection[i] = from.getField(toFields.get(i).name()).pos(); + } + return newProjection; + }); + } + + public static UnaryOperator projectRecord(Schema from, Schema to) { + int[] projection = getProjection(from, to); + return arrayWritable -> { + Writable[] values = new Writable[arrayWritable.get().length]; + for (int i = 0; i < projection.length; i++) { + values[i] = arrayWritable.get()[projection[i]]; + } + arrayWritable.set(values); + return arrayWritable; + }; + } + + public static int[] getReverseProjection(Schema from, Schema to) { + return PROJECTION_CACHE.get(Pair.of(from, to), schemas -> { + List fromFields = from.getFields(); + int[] newProjection = new int[fromFields.size()]; + for (int i = 0; i < newProjection.length; i++) { + newProjection[i] = to.getField(fromFields.get(i).name()).pos(); + } + return newProjection; + }); + } + + public static UnaryOperator reverseProject(Schema from, Schema to) { + int[] projection = getReverseProjection(from, to); + return arrayWritable -> { + Writable[] values = new Writable[to.getFields().size()]; + for (int i = 0; i < projection.length; i++) { + values[projection[i]] = arrayWritable.get()[i]; + } + arrayWritable.set(values); + return arrayWritable; + }; + } + + public static Object getWritableValue(ArrayWritable arrayWritable, ArrayWritableObjectInspector objectInspector, String name) { + return objectInspector.getStructFieldData(arrayWritable, objectInspector.getStructFieldRef(name)); + } +} + + diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java new file mode 100644 index 0000000000000..b3a22fbac25a2 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop.utils; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import org.apache.avro.Schema; +import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapred.JobConf; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class ObjectInspectorCache { + private final Map columnTypeMap = new HashMap<>(); + private final Cache + objectInspectorCache = Caffeine.newBuilder().maximumSize(1000).build(); + + public Map getColumnTypeMap() { + return columnTypeMap; + } + + public ObjectInspectorCache(Schema tableSchema, JobConf jobConf) { + //From AbstractRealtimeRecordReader#prepareHiveAvroSerializer + // hive will append virtual columns at the end of column list. we should remove those columns. + // eg: current table is col1, col2, col3; jobConf.get(serdeConstants.LIST_COLUMNS): col1, col2, col3 ,BLOCK__OFFSET__INSIDE__FILE ... + Set writerSchemaColNames = tableSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).collect(Collectors.toSet()); + List columnNameList = Arrays.stream(jobConf.get(serdeConstants.LIST_COLUMNS).split(",")).collect(Collectors.toList()); + System.out.println("Table schema is " + tableSchema); + System.out.println("Column Name List is: " + jobConf.get(serdeConstants.LIST_COLUMNS)); + List columnTypeList = TypeInfoUtils.getTypeInfosFromTypeString(jobConf.get(serdeConstants.LIST_COLUMN_TYPES)); + + int columnNameListLen = columnNameList.size() - 1; + for (int i = columnNameListLen; i >= 0; i--) { + String lastColName = columnNameList.get(columnNameList.size() - 1); + // virtual columns will only append at the end of column list. it will be ok to break the loop. + if (writerSchemaColNames.contains(lastColName)) { + break; + } + columnNameList.remove(columnNameList.size() - 1); + columnTypeList.remove(columnTypeList.size() - 1); + } + + //Use columnNameList.size() instead of columnTypeList because the type list is longer for some reason + IntStream.range(0, columnNameList.size()).boxed().forEach(i -> columnTypeMap.put(columnNameList.get(i), + TypeInfoUtils.getTypeInfosFromTypeString(columnTypeList.get(i).getQualifiedName()).get(0))); + + StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNameList, columnTypeList); + ArrayWritableObjectInspector objectInspector = new ArrayWritableObjectInspector(rowTypeInfo); + objectInspectorCache.put(tableSchema, objectInspector); + } + + public ArrayWritableObjectInspector getObjectInspector(Schema schema) { + return objectInspectorCache.get(schema, s -> { + List columnNameList = s.getFields().stream().map(Schema.Field::name).collect(Collectors.toList()); + List columnTypeList = columnNameList.stream().map(columnTypeMap::get).collect(Collectors.toList()); + StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNameList, columnTypeList); + return new ArrayWritableObjectInspector(rowTypeInfo); + }); + + } + + public Object getValue(ArrayWritable record, Schema schema, String fieldName) { + ArrayWritableObjectInspector objectInspector = getObjectInspector(schema); + return objectInspector.getStructFieldData(record, objectInspector.getStructFieldRef(fieldName)); + } +} From 36743900217df44a25468d80d2b550c7a604cad6 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Wed, 27 Dec 2023 21:22:08 -0500 Subject: [PATCH 02/41] refactor a bit and add some comments --- .../common/engine/HoodieReaderContext.java | 5 +- .../table/read/HoodieFileGroupReader.java | 62 +++++++++---------- .../reader/HoodieTestReaderContext.java | 5 +- .../hudi/hadoop/HiveHoodieReaderContext.java | 23 +++---- .../HoodieFileGroupReaderRecordReader.java | 9 +-- .../utils/HoodieArrayWritableAvroUtils.java | 8 +++ 6 files changed, 61 insertions(+), 51 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java index c61d63e10f1b8..22ab06b354b3b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java @@ -219,7 +219,10 @@ public Map updateSchemaAndResetOrderingValInMetadata(Map mergeBootstrapReaders(ClosableIterator skeletonFileIterator, ClosableIterator dataFileIterator); + public abstract ClosableIterator mergeBootstrapReaders(ClosableIterator skeletonFileIterator, + Schema skeletonRequiredSchema, + ClosableIterator dataFileIterator, + Schema dataRequiredSchema); /** * Creates a function that will reorder records of schema "from" to schema of "to" diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupReader.java index e4514f8dbdc76..56173ae0e60b2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupReader.java @@ -67,7 +67,6 @@ * in Spark and {@code RowData} in Flink. */ public final class HoodieFileGroupReader implements Closeable { - private boolean schemaInOrder = true; private final HoodieReaderContext readerContext; private final Option hoodieBaseFileOption; private final List logFiles; @@ -175,6 +174,10 @@ private ClosableIterator makeBaseFileIterator() throws IOException { } private Schema generateRequiredSchema() { + return maybeReorderForBootstrap(generateRequiredSchemaHelper()); + } + + private Schema generateRequiredSchemaHelper() { //might need to change this if other queries than mor have mandatory fields if (logFiles.isEmpty()) { return requestedSchema; @@ -193,17 +196,13 @@ private Schema generateRequiredSchema() { } if (addedFields.isEmpty()) { - return maybeReorderForBootstrap(requestedSchema); + return requestedSchema; } - return maybeReorderForBootstrap(appendFieldsToSchema(requestedSchema, addedFields)); + return appendFieldsToSchema(requestedSchema, addedFields); } private Schema maybeReorderForBootstrap(Schema input) { - if (schemaInOrder) { - return createSchemaFromFields(input.getFields().stream() - .sorted((o1, o2) -> Integer.compare(dataSchema.getField(o1.name()).pos(),dataSchema.getField(o2.name()).pos())).collect(Collectors.toList())); - } if (this.hoodieBaseFileOption.isPresent() && this.hoodieBaseFileOption.get().getBootstrapBaseFile().isPresent()) { Pair, List> requiredFields = getDataAndMetaCols(input); if (!(requiredFields.getLeft().isEmpty() || requiredFields.getRight().isEmpty())) { @@ -234,35 +233,36 @@ private Schema createSchemaFromFields(List fields) { private ClosableIterator makeBootstrapBaseFileIterator(HoodieBaseFile baseFile) throws IOException { BaseFile dataFile = baseFile.getBootstrapBaseFile().get(); - Pair, List> requiredFields = - getDataAndMetaCols(requiredSchema); - Pair, List> allFields = getDataAndMetaCols(dataSchema); - - Option> dataFileIterator = - requiredFields.getRight().isEmpty() ? Option.empty() : - Option.of(readerContext.getFileRecordIterator( - dataFile.getStoragePath(), 0, - dataFile.getFileLen(), - createSchemaFromFields(allFields.getRight()), - createSchemaFromFields(requiredFields.getRight()), storage)); - - Option> skeletonFileIterator = - requiredFields.getLeft().isEmpty() ? Option.empty() : - Option.of(readerContext.getFileRecordIterator( - baseFile.getStoragePath(), 0, - baseFile.getFileLen(), - createSchemaFromFields(allFields.getLeft()), - createSchemaFromFields(requiredFields.getLeft()), storage)); + Pair,List> requiredFields = getDataAndMetaCols(requiredSchema); + Pair,List> allFields = getDataAndMetaCols(dataSchema); + Option,Schema>> dataFileIterator = + makeBootstrapBaseFileIteratorHelper(requiredFields.getRight(), allFields.getRight(), dataFile); + Option,Schema>> skeletonFileIterator = + makeBootstrapBaseFileIteratorHelper(requiredFields.getLeft(), allFields.getLeft(), baseFile); if (!dataFileIterator.isPresent() && !skeletonFileIterator.isPresent()) { throw new IllegalStateException("should not be here if only partition cols are required"); } else if (!dataFileIterator.isPresent()) { - return skeletonFileIterator.get(); + return skeletonFileIterator.get().getLeft(); } else if (!skeletonFileIterator.isPresent()) { - return dataFileIterator.get(); + return dataFileIterator.get().getLeft(); } else { - return readerContext.mergeBootstrapReaders(skeletonFileIterator.get(), - dataFileIterator.get()); + return readerContext.mergeBootstrapReaders(skeletonFileIterator.get().getLeft(), skeletonFileIterator.get().getRight(), + dataFileIterator.get().getLeft(), dataFileIterator.get().getRight()); + } + } + + private Option,Schema>> makeBootstrapBaseFileIteratorHelper(List requiredFields, + List allFields, + BaseFile file) throws IOException { + if (requiredFields.isEmpty()) { + return Option.empty(); } + Schema requiredSchema = createSchemaFromFields(requiredFields); + return Option.of(Pair.of(readerContext.getFileRecordIterator( + file.getStoragePath(), 0, + file.getFileLen(), + createSchemaFromFields(allFields), + createSchemaFromFields(requiredFields), storage), requiredSchema)); } /** @@ -289,8 +289,6 @@ public T next() { } private void scanLogFiles() { - System.out.println("Scanning log files"); - String path = readerState.tablePath; HoodieMergedLogRecordReader logRecordReader = HoodieMergedLogRecordReader.newBuilder() .withHoodieReaderContext(readerContext) .withStorage(storage) diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/reader/HoodieTestReaderContext.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/reader/HoodieTestReaderContext.java index 083738827735a..1f122968c16ad 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/reader/HoodieTestReaderContext.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/reader/HoodieTestReaderContext.java @@ -169,9 +169,8 @@ public IndexedRecord seal(IndexedRecord record) { } @Override - public ClosableIterator mergeBootstrapReaders( - ClosableIterator skeletonFileIterator, - ClosableIterator dataFileIterator) { + public ClosableIterator mergeBootstrapReaders(ClosableIterator skeletonFileIterator, Schema skeletonRequiredSchema, + ClosableIterator dataFileIterator, Schema dataRequiredSchema) { return null; } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index 11873aded1cf4..2477b5b2c7c87 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -68,11 +68,7 @@ public class HiveHoodieReaderContext extends HoodieReaderContext protected final Schema writerSchema; protected Map hosts; protected final Map columnTypeMap; - private final ObjectInspectorCache objectInspectorCache; - - private final String tableName; - private RecordReader firstRecordReader = null; protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCreator readerCreator, @@ -88,7 +84,7 @@ protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCr this.reporter = reporter; this.writerSchema = writerSchema; this.hosts = hosts; - this.tableName = metaClient.getTableConfig().getTableName(); + String tableName = metaClient.getTableConfig().getTableName(); this.objectInspectorCache = HoodieArrayWritableAvroUtils.getCacheForTable(tableName, writerSchema, jobConf); this.columnTypeMap = objectInspectorCache.getColumnTypeMap(); } @@ -111,6 +107,7 @@ public ClosableIterator getFileRecordIterator(Path filePath, long if (dataSchema.equals(requiredSchema)) { return recordIterator; } + //The record reader puts the required columns in the positions of the data schema and nulls the rest of the columns return new CloseableMappingIterator<>(recordIterator, projectRecord(dataSchema, requiredSchema)); } @@ -119,6 +116,7 @@ private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchem List dataColumnTypeList = dataColumnNameList.stream().map(columnTypeMap::get).collect(Collectors.toList()); jobConf.set(serdeConstants.LIST_COLUMNS, String.join(",", dataColumnNameList)); jobConf.set(serdeConstants.LIST_COLUMN_TYPES, dataColumnTypeList.stream().map(TypeInfo::getQualifiedName).collect(Collectors.joining(","))); + //don't replace `f -> f.name()` with lambda reference String readColNames = requiredSchema.getFields().stream().map(f -> f.name()).collect(Collectors.joining(",")); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNames); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, requiredSchema.getFields() @@ -128,8 +126,7 @@ private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchem @Override public ArrayWritable convertAvroRecord(IndexedRecord avroRecord) { //should be support timestamp? - ArrayWritable convertedRecord = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(avroRecord, avroRecord.getSchema(), false); - return convertedRecord; + return (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(avroRecord, avroRecord.getSchema(), false); } @Override @@ -164,7 +161,11 @@ public ArrayWritable seal(ArrayWritable record) { @Override public ClosableIterator mergeBootstrapReaders(ClosableIterator skeletonFileIterator, - ClosableIterator dataFileIterator) { + Schema skeletonRequiredSchema, + ClosableIterator dataFileIterator, + Schema dataRequiredSchema) { + int skeletonLen = skeletonRequiredSchema.getFields().size(); + int dataLen = dataRequiredSchema.getFields().size(); return new ClosableIterator() { private final ArrayWritable returnWritable = new ArrayWritable(Writable.class); @@ -180,9 +181,9 @@ public boolean hasNext() { public ArrayWritable next() { Writable[] skeletonWritable = skeletonFileIterator.next().get(); Writable[] dataWritable = dataFileIterator.next().get(); - Writable[] mergedWritable = new Writable[skeletonWritable.length + dataWritable.length]; - System.arraycopy(skeletonWritable, 0, mergedWritable, 0, skeletonWritable.length); - System.arraycopy(dataWritable, 0, mergedWritable, skeletonWritable.length, dataWritable.length); + Writable[] mergedWritable = new Writable[skeletonLen + dataLen]; + System.arraycopy(skeletonWritable, 0, mergedWritable, 0, skeletonLen); + System.arraycopy(dataWritable, 0, mergedWritable, skeletonLen, dataLen); returnWritable.set(mergedWritable); return returnWritable; } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index 860ab85c5b0e2..b8c9df3c59fba 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -72,14 +72,11 @@ org.apache.hadoop.mapred.RecordReader getRecordRead } private final HiveHoodieReaderContext readerContext; - private final HoodieFileGroupReader fileGroupReader; private final ArrayWritable arrayWritable; private final NullWritable nullWritable = NullWritable.get(); - private final InputSplit inputSplit; private final JobConf jobConf; - private final UnaryOperator reverseProjection; public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, @@ -193,6 +190,9 @@ private static String getLatestCommitTime(InputSplit split, HoodieTableMetaClien } } + /** + * Convert FileSplit to FileSlice, but save the locations in 'hosts' because that data is otherwise lost. + */ private static FileSlice getFileSliceFromSplit(FileSplit split, Map hosts, FileSystem fs, String tableBasePath) throws IOException { if (split instanceof RealtimeSplit) { RealtimeSplit realtimeSplit = (RealtimeSplit) split; @@ -218,10 +218,11 @@ private static FileSlice getFileSliceFromSplit(FileSplit split, Map partitionColumns; if (partitionColString == null) { - partitionColumns = Collections.EMPTY_SET; + partitionColumns = Collections.emptySet(); } else { partitionColumns = Arrays.stream(partitionColString.split(",")).collect(Collectors.toSet()); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java index 50832b1908528..a2da796c6f776 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java @@ -59,6 +59,10 @@ public static int[] getProjection(Schema from, Schema to) { }); } + /** + * Projection will keep the size from the "from" schema because it gets recycled + * and if the size changes the reader will fail + */ public static UnaryOperator projectRecord(Schema from, Schema to) { int[] projection = getProjection(from, to); return arrayWritable -> { @@ -82,6 +86,10 @@ public static int[] getReverseProjection(Schema from, Schema to) { }); } + /** + * After the reading and merging etc is done, we need to put the records + * into the positions of the original schema + */ public static UnaryOperator reverseProject(Schema from, Schema to) { int[] projection = getReverseProjection(from, to); return arrayWritable -> { From edad863caab56307933963254260f35542e489f3 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Wed, 27 Dec 2023 22:26:39 -0500 Subject: [PATCH 03/41] make build properly --- .../hudi/SparkFileFormatInternalRowReaderContext.scala | 4 +++- .../org/apache/hudi/hadoop/HiveHoodieReaderContext.java | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala index c3d49ca47a89a..1de8c35650549 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala @@ -101,7 +101,9 @@ class SparkFileFormatInternalRowReaderContext(readerMaps: mutable.Map[Long, Part } override def mergeBootstrapReaders(skeletonFileIterator: ClosableIterator[InternalRow], - dataFileIterator: ClosableIterator[InternalRow]): ClosableIterator[InternalRow] = { + skeletonRequiredSchema: Schema, + dataFileIterator: ClosableIterator[InternalRow], + dataRequiredSchema: Schema): ClosableIterator[InternalRow] = { doBootstrapMerge(skeletonFileIterator.asInstanceOf[ClosableIterator[Any]], dataFileIterator.asInstanceOf[ClosableIterator[Any]]) } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index 2477b5b2c7c87..785e09d72c0c1 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -161,9 +161,9 @@ public ArrayWritable seal(ArrayWritable record) { @Override public ClosableIterator mergeBootstrapReaders(ClosableIterator skeletonFileIterator, - Schema skeletonRequiredSchema, - ClosableIterator dataFileIterator, - Schema dataRequiredSchema) { + Schema skeletonRequiredSchema, + ClosableIterator dataFileIterator, + Schema dataRequiredSchema) { int skeletonLen = skeletonRequiredSchema.getFields().size(); int dataLen = dataRequiredSchema.getFields().size(); return new ClosableIterator() { From b7640263a1807df2a06a58fcc2d3c52e6aaf752e Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 11:33:15 -0500 Subject: [PATCH 04/41] fix some of the failing tests --- .../common/testutils/HoodieTestUtils.java | 6 +++++ .../HoodieFileGroupReaderRecordReader.java | 24 +++++++++++-------- .../apache/hudi/hadoop/HoodieHiveRecord.java | 4 ++++ .../hadoop/utils/ObjectInspectorCache.java | 2 -- .../apache/hudi/functional/TestBootstrap.java | 1 + .../TestSparkConsistentBucketClustering.java | 2 +- 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java index 4d4fc57cc09cd..102ee61ac2c33 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java @@ -29,6 +29,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.storage.HoodieStorage; @@ -181,11 +182,16 @@ public static HoodieTableMetaClient init(StorageConfiguration storageConf, St public static HoodieTableMetaClient init(StorageConfiguration storageConf, String basePath, HoodieTableType tableType, Properties properties, String databaseName) throws IOException { + String preCombineField = ConfigUtils.getOrderingField(properties); + if (preCombineField == null && tableType.equals(HoodieTableType.MERGE_ON_READ)) { + preCombineField = "timestamp"; + } HoodieTableMetaClient.PropertyBuilder builder = HoodieTableMetaClient.withPropertyBuilder() .setDatabaseName(databaseName) .setTableName(RAW_TRIPS_TEST_NAME) .setTableType(tableType) + .setPreCombineField(preCombineField) .setPayloadClass(HoodieAvroPayload.class); String keyGen = properties.getProperty("hoodie.datasource.write.keygenerator.class"); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index b8c9df3c59fba..b603e737c1589 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -31,7 +31,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.TablePathUtils; -import org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit; import org.apache.hudi.hadoop.realtime.RealtimeSplit; import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; @@ -194,27 +193,32 @@ private static String getLatestCommitTime(InputSplit split, HoodieTableMetaClien * Convert FileSplit to FileSlice, but save the locations in 'hosts' because that data is otherwise lost. */ private static FileSlice getFileSliceFromSplit(FileSplit split, Map hosts, FileSystem fs, String tableBasePath) throws IOException { + BaseFile bootstrapBaseFile = createBootstrapBaseFile(split, hosts, fs); if (split instanceof RealtimeSplit) { + //mor RealtimeSplit realtimeSplit = (RealtimeSplit) split; HoodieFileGroupId fileGroupId = new HoodieFileGroupId(FSUtils.getFileId(realtimeSplit.getPath().getName()), FSUtils.getPartitionPath(realtimeSplit.getBasePath(), realtimeSplit.getPath().getParent().toString()).toString()); String commitTime = FSUtils.getCommitTime(realtimeSplit.getPath().toString()); - BaseFile bootstrapBaseFile = null; - if (realtimeSplit instanceof HoodieRealtimeBootstrapBaseFileSplit) { - HoodieRealtimeBootstrapBaseFileSplit hoodieRealtimeBootstrapBaseFileSplit = (HoodieRealtimeBootstrapBaseFileSplit) realtimeSplit; - FileSplit bootstrapBaseFileSplit = hoodieRealtimeBootstrapBaseFileSplit.getBootstrapFileSplit(); - hosts.put(bootstrapBaseFileSplit.getPath().toString(), bootstrapBaseFileSplit.getLocations()); - bootstrapBaseFile = new BaseFile(fs.getFileStatus(bootstrapBaseFileSplit.getPath())); - } hosts.put(realtimeSplit.getPath().toString(), realtimeSplit.getLocations()); HoodieBaseFile hoodieBaseFile = new HoodieBaseFile(fs.getFileStatus(realtimeSplit.getPath()), bootstrapBaseFile); return new FileSlice(fileGroupId, commitTime, hoodieBaseFile, realtimeSplit.getDeltaLogFiles()); } - //just regular cow + //cow HoodieFileGroupId fileGroupId = new HoodieFileGroupId(FSUtils.getFileId(split.getPath().getName()), FSUtils.getPartitionPath(tableBasePath, split.getPath().getParent().toString()).toString()); hosts.put(split.getPath().toString(), split.getLocations()); - return new FileSlice(fileGroupId, FSUtils.getCommitTime(split.getPath().toString()), new HoodieBaseFile(fs.getFileStatus(split.getPath())), Collections.emptyList()); + return new FileSlice(fileGroupId, FSUtils.getCommitTime(split.getPath().toString()), new HoodieBaseFile(fs.getFileStatus(split.getPath()), bootstrapBaseFile), Collections.emptyList()); + } + + private static BaseFile createBootstrapBaseFile(FileSplit split, Map hosts, FileSystem fs) throws IOException { + if (split instanceof BootstrapBaseFileSplit) { + BootstrapBaseFileSplit bootstrapBaseFileSplit = (BootstrapBaseFileSplit) split; + FileSplit bootstrapFileSplit = bootstrapBaseFileSplit.getBootstrapFileSplit(); + hosts.put(bootstrapFileSplit.getPath().toString(), bootstrapFileSplit.getLocations()); + return new BaseFile(fs.getFileStatus(bootstrapFileSplit.getPath())); + } + return null; } private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java index 702f4880d79a5..2ab77cc98be92 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java @@ -115,6 +115,10 @@ public HoodieRecord newInstance(HoodieKey key) { @Override public Comparable getOrderingValue(Schema recordSchema, Properties props) { + String orderingField = ConfigUtils.getOrderingField(props); + if (orderingField == null) { + throw new IllegalArgumentException("Ordering Field is not set. Precombine must be set. (If you are using a custom record merger it might be something else)"); + } return (Comparable) getValue(ConfigUtils.getOrderingField(props)); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java index b3a22fbac25a2..47564ae8caa0e 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java @@ -55,8 +55,6 @@ public ObjectInspectorCache(Schema tableSchema, JobConf jobConf) { // eg: current table is col1, col2, col3; jobConf.get(serdeConstants.LIST_COLUMNS): col1, col2, col3 ,BLOCK__OFFSET__INSIDE__FILE ... Set writerSchemaColNames = tableSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).collect(Collectors.toSet()); List columnNameList = Arrays.stream(jobConf.get(serdeConstants.LIST_COLUMNS).split(",")).collect(Collectors.toList()); - System.out.println("Table schema is " + tableSchema); - System.out.println("Column Name List is: " + jobConf.get(serdeConstants.LIST_COLUMNS)); List columnTypeList = TypeInfoUtils.getTypeInfosFromTypeString(jobConf.get(serdeConstants.LIST_COLUMN_TYPES)); int columnNameListLen = columnNameList.size() - 1; diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java index b6795bc2a2ae3..99fcdcbf8a339 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java @@ -253,6 +253,7 @@ private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, Effec long timestamp = Instant.now().toEpochMilli(); Schema schema = generateNewDataSetAndReturnSchema(timestamp, totalRecords, partitions, bootstrapBasePath); HoodieWriteConfig config = getConfigBuilder(schema.toString()) + .withPreCombineField("timestamp") .withAutoCommit(true) .withSchema(schema.toString()) .withKeyGenerator(keyGeneratorClass) diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestSparkConsistentBucketClustering.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestSparkConsistentBucketClustering.java index 723d2389d22e0..2f2d2ba0efafe 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestSparkConsistentBucketClustering.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestSparkConsistentBucketClustering.java @@ -219,7 +219,7 @@ public void testLoadMetadata(boolean isCommitFilePresent, boolean rowWriterEnabl */ @ParameterizedTest @MethodSource("configParamsForSorting") - public void testClusteringColumnSort(String sortColumn, boolean rowWriterEnable) throws IOException { + public void testClusteringColumnSort(String sortColumn, boolean rowWriterEnable) throws Exception { Map options = new HashMap<>(); // Record key is handled specially if (sortColumn.equals("_row_key")) { From 664786915c1b3a5ccb953dcdc86c34c53762ff96 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 12:20:04 -0500 Subject: [PATCH 05/41] revert to old impl when schema evolution enabled --- .../HoodieFileGroupReaderRecordReader.java | 9 +- .../hudi/hadoop/HoodieParquetInputFormat.java | 39 ++++-- .../AbstractRealtimeRecordReader.java | 5 +- .../HoodieCombineRealtimeRecordReader.java | 114 +++++++++++++----- .../HoodieParquetRealtimeInputFormat.java | 22 +++- .../TestHiveTableSchemaEvolution.java | 2 + 6 files changed, 146 insertions(+), 45 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index b603e737c1589..e718ef20564b6 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -20,6 +20,8 @@ package org.apache.hudi.hadoop; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.config.HoodieReaderConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.FileSlice; @@ -165,7 +167,7 @@ private static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, Job } } - private static String getTableBasePath(InputSplit split, JobConf jobConf) throws IOException { + public static String getTableBasePath(InputSplit split, JobConf jobConf) throws IOException { if (split instanceof RealtimeSplit) { RealtimeSplit realtimeSplit = (RealtimeSplit) split; return realtimeSplit.getBasePath(); @@ -233,4 +235,9 @@ private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) return HoodieAvroUtils.generateProjectionSchema(tableSchema, Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c -> !partitionColumns.contains(c)).collect(Collectors.toList())); } + + public static boolean useFilegroupReader(final JobConf jobConf) { + return jobConf.getBoolean(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), HoodieReaderConfig.FILE_GROUP_READER_ENABLED.defaultValue()) + && !jobConf.getBoolean(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.defaultValue()); + } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index a61ac3709f4b9..286b1f8607d42 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -23,7 +23,9 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hadoop.avro.HoodieTimestampAwareParquetInputFormat; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; +import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.plan.TableScanDesc; @@ -91,6 +93,24 @@ private void initAvroInputFormat() { @Override public RecordReader getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException { + + if (HoodieFileGroupReaderRecordReader.useFilegroupReader(job)) { + try { + if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { + return new HoodieFileGroupReaderRecordReader((s, j, r) -> { + try { + return new ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(), s, j, r); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + }, split, job, reporter); + } else { + return new HoodieFileGroupReaderRecordReader(super::getRecordReader, split, job, reporter); + } + } catch (final IOException e) { + throw new RuntimeException("Cannot create a RecordReaderWrapper", e); + } + } // TODO enable automatic predicate pushdown after fixing issues // FileSplit fileSplit = (FileSplit) split; // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent()); @@ -103,6 +123,9 @@ public RecordReader getRecordReader(final InputSpli // ParquetInputFormat.setFilterPredicate(job, predicate); // clearOutExistingPredicate(job); // } + if (split instanceof BootstrapBaseFileSplit) { + return createBootstrappingRecordReader(split, job, reporter); + } // adapt schema evolution new SchemaEvolutionContext(split, job).doEvolutionForParquetFormat(); @@ -111,26 +134,20 @@ public RecordReader getRecordReader(final InputSpli LOG.debug("EMPLOYING DEFAULT RECORD READER - " + split); } - //HoodieRealtimeInputFormatUtils.addProjectionField(job, job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").split("/")); + HoodieRealtimeInputFormatUtils.addProjectionField(job, job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").split("/")); return getRecordReaderInternal(split, job, reporter); } private RecordReader getRecordReaderInternal(InputSplit split, JobConf job, - Reporter reporter) { + Reporter reporter) throws IOException { try { if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { - return new HoodieFileGroupReaderRecordReader((s, j, r) -> { - try { - return new ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(), s, j, r); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - }, split, job, reporter); + return new ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(), split, job, reporter); } else { - return new HoodieFileGroupReaderRecordReader(super::getRecordReader, split, job, reporter); + return super.getRecordReader(split, job, reporter); } - } catch (final IOException e) { + } catch (final InterruptedException | IOException e) { throw new RuntimeException("Cannot create a RecordReaderWrapper", e); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java index 8ab08e0772b5c..88eef5cc57ead 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hadoop.HoodieColumnProjectionUtils; +import org.apache.hudi.hadoop.HoodieFileGroupReaderRecordReader; import org.apache.hudi.hadoop.SchemaEvolutionContext; import org.apache.hudi.hadoop.fs.HadoopFSUtils; import org.apache.hudi.hadoop.utils.HiveAvroSerializer; @@ -103,7 +104,9 @@ public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) { throw new HoodieException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); } prepareHiveAvroSerializer(); - throw new HoodieException("don't want to use this reader"); + if (HoodieFileGroupReaderRecordReader.useFilegroupReader(jobConf)) { + throw new IllegalStateException("Should not be here, should be using filegroup reader"); + } } private boolean usesCustomPayload(HoodieTableMetaClient metaClient) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java index 338bbf00c8744..b666161d76950 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java @@ -43,65 +43,123 @@ public class HoodieCombineRealtimeRecordReader implements RecordReader recordReaders = new LinkedList<>(); + private List recordReaders = new LinkedList<>(); // Points to the currently iterating record reader - HoodieFileGroupReaderRecordReader currentRecordReader; + private HoodieRealtimeRecordReader currentRecordReader; + + private final boolean useFileGroupReader; + + // RecordReaders for each split + private List recordReadersFG = new LinkedList<>(); + // Points to the currently iterating record reader + private HoodieFileGroupReaderRecordReader currentRecordReaderFG; public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split, List readers) { - try { - ValidationUtils.checkArgument(((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits().size() == readers - .size(), "Num Splits does not match number of unique RecordReaders!"); - for (InputSplit rtSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) { - LOG.info("Creating new RealtimeRecordReader for split"); - RecordReader reader = readers.remove(0); - ValidationUtils.checkArgument(reader instanceof HoodieFileGroupReaderRecordReader, reader.toString() + "not instance of HoodieFileGroupReaderRecordReader "); - recordReaders.add((HoodieFileGroupReaderRecordReader) reader); + useFileGroupReader = HoodieFileGroupReaderRecordReader.useFilegroupReader(jobConf); + if (useFileGroupReader) { + try { + ValidationUtils.checkArgument(((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits().size() == readers + .size(), "Num Splits does not match number of unique RecordReaders!"); + for (InputSplit rtSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) { + LOG.info("Creating new RealtimeRecordReader for split"); + RecordReader reader = readers.remove(0); + ValidationUtils.checkArgument(reader instanceof HoodieFileGroupReaderRecordReader, reader.toString() + "not instance of HoodieFileGroupReaderRecordReader "); + recordReadersFG.add((HoodieFileGroupReaderRecordReader) reader); + } + currentRecordReaderFG = recordReadersFG.remove(0); + } catch (Exception e) { + throw new RuntimeException(e); + } + } else { + try { + ValidationUtils.checkArgument(((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits().size() == readers + .size(), "Num Splits does not match number of unique RecordReaders!"); + for (InputSplit rtSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) { + LOG.info("Creating new RealtimeRecordReader for split"); + recordReaders.add( + new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) rtSplit, jobConf, readers.remove(0))); + } + currentRecordReader = recordReaders.remove(0); + } catch (Exception e) { + throw new RuntimeException(e); } - currentRecordReader = recordReaders.remove(0); - } catch (Exception e) { - throw new RuntimeException(e); } } @Override public boolean next(NullWritable key, ArrayWritable value) throws IOException { - if (this.currentRecordReader.next(key, value)) { - return true; - } else if (recordReaders.size() > 0) { - this.currentRecordReader.close(); - this.currentRecordReader = recordReaders.remove(0); - HoodieFileGroupReaderRecordReader reader = currentRecordReader; - // when switch reader, ioctx should be updated - IOContextMap.get(reader.getJobConf()).setInputPath(reader.getSplit().getPath()); - return next(key, value); + if (useFileGroupReader) { + if (this.currentRecordReaderFG.next(key, value)) { + return true; + } else if (recordReadersFG.size() > 0) { + this.currentRecordReaderFG.close(); + this.currentRecordReaderFG = recordReadersFG.remove(0); + HoodieFileGroupReaderRecordReader reader = currentRecordReaderFG; + // when switch reader, ioctx should be updated + IOContextMap.get(reader.getJobConf()).setInputPath(reader.getSplit().getPath()); + return next(key, value); + } else { + return false; + } } else { - return false; + if (this.currentRecordReader.next(key, value)) { + return true; + } else if (recordReaders.size() > 0) { + this.currentRecordReader.close(); + this.currentRecordReader = recordReaders.remove(0); + AbstractRealtimeRecordReader reader = (AbstractRealtimeRecordReader)currentRecordReader.getReader(); + // when switch reader, ioctx should be updated + IOContextMap.get(reader.getJobConf()).setInputPath(reader.getSplit().getPath()); + return next(key, value); + } else { + return false; + } } } @Override public NullWritable createKey() { - return this.currentRecordReader.createKey(); + if (useFileGroupReader) { + return this.currentRecordReaderFG.createKey(); + } else { + return this.currentRecordReader.createKey(); + } } @Override public ArrayWritable createValue() { - return this.currentRecordReader.createValue(); + if (useFileGroupReader) { + return this.currentRecordReaderFG.createValue(); + } else { + return this.currentRecordReader.createValue(); + } } @Override public long getPos() throws IOException { - return this.currentRecordReader.getPos(); + if (useFileGroupReader) { + return this.currentRecordReaderFG.getPos(); + } else { + return this.currentRecordReader.getPos(); + } } @Override public void close() throws IOException { - this.currentRecordReader.close(); + if (useFileGroupReader) { + this.currentRecordReaderFG.close(); + } else { + this.currentRecordReader.close(); + } } @Override public float getProgress() throws IOException { - return this.currentRecordReader.getProgress(); + if (useFileGroupReader) { + return this.currentRecordReaderFG.getProgress(); + } else { + return this.currentRecordReader.getProgress(); + } } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index 7da6964997198..3974a4c7e3bd4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -18,10 +18,13 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.hadoop.HoodieFileGroupReaderRecordReader; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; import org.apache.hudi.hadoop.UseRecordReaderFromInputFormat; @@ -67,14 +70,25 @@ public RecordReader getRecordReader(final InputSpli ValidationUtils.checkArgument(split instanceof RealtimeSplit, "HoodieRealtimeRecordReader can only work on RealtimeSplit and not with " + split); RealtimeSplit realtimeSplit = (RealtimeSplit) split; + + if (HoodieFileGroupReaderRecordReader.useFilegroupReader(jobConf)) { + return super.getRecordReader(realtimeSplit, jobConf, reporter); + } + // add preCombineKey - //HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(realtimeSplit.getBasePath()).build(); - //HoodieTableConfig tableConfig = metaClient.getTableConfig(); - //addProjectionToJobConf(realtimeSplit, jobConf, tableConfig); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(realtimeSplit.getBasePath()).build(); + HoodieTableConfig tableConfig = metaClient.getTableConfig(); + addProjectionToJobConf(realtimeSplit, jobConf, tableConfig); LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); - return super.getRecordReader(realtimeSplit, jobConf, reporter); + // for log only split, set the parquet reader as empty. + if (FSUtils.isLogFile(realtimeSplit.getPath())) { + return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, new HoodieEmptyRecordReader(realtimeSplit, jobConf)); + } + + return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, + super.getRecordReader(split, jobConf, reporter)); } void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf jobConf, HoodieTableConfig tableConfig) { diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java index a5a45cabf81dc..806f775442317 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java @@ -19,6 +19,7 @@ package org.apache.hudi.functional; import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; @@ -106,6 +107,7 @@ public void testHiveReadSchemaEvolutionTable(String tableType) throws Exception spark.sql(String.format("alter table %s rename column col2 to col2_new", tableName)); JobConf jobConf = new JobConf(); + jobConf.set(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), "true"); jobConf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false"); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "col1,col2_new"); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "6,7"); From b43949b4db2427c4f8e9207eab3dc169f4f52b17 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 12:31:29 -0500 Subject: [PATCH 06/41] disable fg reader for stupid test --- .../hadoop/TestHoodieParquetInputFormat.java | 99 ++++++++++--------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java index 08cd33c2d56ed..e13d284606f12 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.common.config.HoodieReaderConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; @@ -777,59 +778,65 @@ private void ensureRecordsInCommit(String msg, String commit, int expectedNumber @Test public void testHoodieParquetInputFormatReadTimeType() throws IOException { - long testTimestampLong = System.currentTimeMillis(); - int testDate = 19116;// 2022-05-04 - - Schema schema = SchemaTestUtil.getSchemaFromResource(getClass(), "/test_timetype.avsc"); - String commit = "20160628071126"; - HoodieTestUtils.init(HoodieTestUtils.getDefaultStorageConf(), basePath.toString(), - HoodieTableType.COPY_ON_WRITE, HoodieFileFormat.PARQUET); - java.nio.file.Path partitionPath = basePath.resolve(Paths.get("2016", "06", "28")); - String fileId = FSUtils.makeBaseFileName(commit, "1-0-1", "fileid1", - HoodieFileFormat.PARQUET.getFileExtension()); - try (AvroParquetWriter parquetWriter = new AvroParquetWriter( - new Path(partitionPath.resolve(fileId).toString()), schema)) { - GenericData.Record record = new GenericData.Record(schema); - record.put("test_timestamp", testTimestampLong * 1000); - record.put("test_long", testTimestampLong * 1000); - record.put("test_date", testDate); - record.put("_hoodie_commit_time", commit); - record.put("_hoodie_commit_seqno", commit + 1); - parquetWriter.write(record); - } - - jobConf.set(IOConstants.COLUMNS, "test_timestamp,test_long,test_date,_hoodie_commit_time,_hoodie_commit_seqno"); - jobConf.set(IOConstants.COLUMNS_TYPES, "timestamp,bigint,date,string,string"); - jobConf.set(READ_COLUMN_NAMES_CONF_STR, "test_timestamp,test_long,test_date,_hoodie_commit_time,_hoodie_commit_seqno"); - InputFormatTestUtil.setupPartition(basePath, partitionPath); - InputFormatTestUtil.commit(basePath, commit); - FileInputFormat.setInputPaths(jobConf, partitionPath.toFile().getPath()); + try { + long testTimestampLong = System.currentTimeMillis(); + int testDate = 19116;// 2022-05-04 + + Schema schema = SchemaTestUtil.getSchemaFromResource(getClass(), "/test_timetype.avsc"); + String commit = "20160628071126"; + HoodieTestUtils.init(HoodieTestUtils.getDefaultStorageConf(), basePath.toString(), + HoodieTableType.COPY_ON_WRITE, HoodieFileFormat.PARQUET); + java.nio.file.Path partitionPath = basePath.resolve(Paths.get("2016", "06", "28")); + String fileId = FSUtils.makeBaseFileName(commit, "1-0-1", "fileid1", + HoodieFileFormat.PARQUET.getFileExtension()); + try (AvroParquetWriter parquetWriter = new AvroParquetWriter( + new Path(partitionPath.resolve(fileId).toString()), schema)) { + GenericData.Record record = new GenericData.Record(schema); + record.put("test_timestamp", testTimestampLong * 1000); + record.put("test_long", testTimestampLong * 1000); + record.put("test_date", testDate); + record.put("_hoodie_commit_time", commit); + record.put("_hoodie_commit_seqno", commit + 1); + parquetWriter.write(record); + } - InputSplit[] splits = inputFormat.getSplits(jobConf, 1); - for (InputSplit split : splits) { - RecordReader recordReader = inputFormat - .getRecordReader(split, jobConf, null); - NullWritable key = recordReader.createKey(); - ArrayWritable writable = recordReader.createValue(); - while (recordReader.next(key, writable)) { - // test timestamp - if (HiveVersionInfo.getShortVersion().startsWith("3")) { - LocalDateTime localDateTime = LocalDateTime.ofInstant( - Instant.ofEpochMilli(testTimestampLong), ZoneOffset.UTC); - assertEquals(Timestamp.valueOf(localDateTime).toString(), String.valueOf(writable.get()[0])); - } else { - Date date = new Date(); + //this is not a hoodie table!! + jobConf.set(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), "false"); + jobConf.set(IOConstants.COLUMNS, "test_timestamp,test_long,test_date,_hoodie_commit_time,_hoodie_commit_seqno"); + jobConf.set(IOConstants.COLUMNS_TYPES, "timestamp,bigint,date,string,string"); + jobConf.set(READ_COLUMN_NAMES_CONF_STR, "test_timestamp,test_long,test_date,_hoodie_commit_time,_hoodie_commit_seqno"); + InputFormatTestUtil.setupPartition(basePath, partitionPath); + InputFormatTestUtil.commit(basePath, commit); + FileInputFormat.setInputPaths(jobConf, partitionPath.toFile().getPath()); + + InputSplit[] splits = inputFormat.getSplits(jobConf, 1); + for (InputSplit split : splits) { + RecordReader recordReader = inputFormat + .getRecordReader(split, jobConf, null); + NullWritable key = recordReader.createKey(); + ArrayWritable writable = recordReader.createValue(); + while (recordReader.next(key, writable)) { + // test timestamp + if (HiveVersionInfo.getShortVersion().startsWith("3")) { + LocalDateTime localDateTime = LocalDateTime.ofInstant( + Instant.ofEpochMilli(testTimestampLong), ZoneOffset.UTC); + assertEquals(Timestamp.valueOf(localDateTime).toString(), String.valueOf(writable.get()[0])); + } else { + Date date = new Date(); date.setTime(testTimestampLong); Timestamp actualTime = ((TimestampWritable) writable.get()[0]).getTimestamp(); SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); assertEquals(dateFormat.format(date), dateFormat.format(actualTime)); + } + // test long + assertEquals(testTimestampLong * 1000, ((LongWritable) writable.get()[1]).get()); + // test date + assertEquals(LocalDate.ofEpochDay(testDate).toString(), String.valueOf(writable.get()[2])); } - // test long - assertEquals(testTimestampLong * 1000, ((LongWritable) writable.get()[1]).get()); - // test date - assertEquals(LocalDate.ofEpochDay(testDate).toString(), String.valueOf(writable.get()[2])); } - recordReader.close(); + } finally { + jobConf.set(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), "true"); } + } } From 2bc441f13ea6bd7b39ccaa244116edaf9ef75b32 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 13:38:42 -0500 Subject: [PATCH 07/41] fix some failing tests --- .../hudi/hadoop/HiveHoodieReaderContext.java | 8 +++++++- .../hive/TestHoodieCombineHiveInputFormat.java | 14 +++++++++++++- .../TestHoodieMergeOnReadSnapshotReader.java | 2 ++ .../realtime/TestHoodieRealtimeRecordReader.java | 2 ++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index 785e09d72c0c1..f3d8d95a4a894 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -113,7 +113,13 @@ public ClosableIterator getFileRecordIterator(Path filePath, long private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchema) { List dataColumnNameList = dataSchema.getFields().stream().map(Schema.Field::name).collect(Collectors.toList()); - List dataColumnTypeList = dataColumnNameList.stream().map(columnTypeMap::get).collect(Collectors.toList()); + List dataColumnTypeList = dataColumnNameList.stream().map(fieldName -> { + TypeInfo type = columnTypeMap.get(fieldName); + if (type == null) { + throw new IllegalArgumentException("Field: " + fieldName + ", does not have a defined type"); + } + return type; + }).collect(Collectors.toList()); jobConf.set(serdeConstants.LIST_COLUMNS, String.join(",", dataColumnNameList)); jobConf.set(serdeConstants.LIST_COLUMN_TYPES, dataColumnTypeList.stream().map(TypeInfo::getQualifiedName).collect(Collectors.joining(","))); //don't replace `f -> f.name()` with lambda reference diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/hive/TestHoodieCombineHiveInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/hive/TestHoodieCombineHiveInputFormat.java index 3371b5efb27be..ab907390f8843 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/hive/TestHoodieCombineHiveInputFormat.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/hive/TestHoodieCombineHiveInputFormat.java @@ -48,6 +48,7 @@ import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileInputFormat; @@ -243,7 +244,18 @@ public void multiLevelPartitionReadersRealtimeCombineHoodieInputFormat() throws HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat(); String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double"; - InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes); + List fields = schema.getFields(); + String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); + String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); + + String hiveColumnNames = fields.stream().map(Schema.Field::name).collect(Collectors.joining(",")); + hiveColumnNames = hiveColumnNames + ",year,month,day"; + String modifiedHiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(tripsHiveColumnTypes); + modifiedHiveColumnTypes = modifiedHiveColumnTypes + ",string,string,string"; + jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames); + jobConf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, modifiedHiveColumnTypes); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions); // unset META_TABLE_PARTITION_COLUMNS to trigger HUDI-1718 jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1); diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadSnapshotReader.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadSnapshotReader.java index f982a97106217..463ad5a2ebc15 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadSnapshotReader.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadSnapshotReader.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.HoodieMemoryConfig; +import org.apache.hudi.common.config.HoodieReaderConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -89,6 +90,7 @@ public void setUp() { baseJobConf.set(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.key(), String.valueOf(1024 * 1024)); baseJobConf.set(serdeConstants.LIST_COLUMNS, COLUMNS); baseJobConf.set(serdeConstants.LIST_COLUMN_TYPES, COLUMN_TYPES); + baseJobConf.set(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), "false"); storage = new HoodieHadoopStorage(HadoopFSUtils.getFs(new StoragePath(basePath.toUri()), baseJobConf)); } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java index 3ee83a09a3b61..b992987c6909b 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java @@ -22,6 +22,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.config.HoodieMemoryConfig; +import org.apache.hudi.common.config.HoodieReaderConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieLogFile; @@ -120,6 +121,7 @@ public void setUp() { storageConf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); baseJobConf = new JobConf(storageConf.unwrap()); baseJobConf.set(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.key(), String.valueOf(1024 * 1024)); + baseJobConf.set(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), "false"); fs = HadoopFSUtils.getFs(basePath.toUri().toString(), baseJobConf); storage = new HoodieHadoopStorage(fs); } From 1bcbab899d72e76a16ab82516d8183a9b537270f Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 13:43:18 -0500 Subject: [PATCH 08/41] assigned the ports backwards --- .../docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml index edc1a1249aa02..d654e0dcf46d9 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml @@ -133,7 +133,7 @@ services: ports: - "10000:10000" # JVM debugging port (will be mapped to a random port on host) - - "5005:64757" + - "64757:5005" depends_on: - "hivemetastore" links: From 6e5fadd5cf9985082facbf38553538aeada5ba00 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 14:06:44 -0500 Subject: [PATCH 09/41] verbose output bundle validation --- packaging/bundle-validation/validate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/bundle-validation/validate.sh b/packaging/bundle-validation/validate.sh index d81f3771f0bf5..d500ea794bd21 100755 --- a/packaging/bundle-validation/validate.sh +++ b/packaging/bundle-validation/validate.sh @@ -97,7 +97,7 @@ test_spark_hadoop_mr_bundles () { # save HiveQL query results hiveqlresultsdir=/tmp/hadoop-mr-bundle/hiveql/trips/results mkdir -p $hiveqlresultsdir - $HIVE_HOME/bin/beeline --hiveconf hive.input.format=org.apache.hudi.hadoop.HoodieParquetInputFormat \ + $HIVE_HOME/bin/beeline --verbose --hiveconf hive.input.format=org.apache.hudi.hadoop.HoodieParquetInputFormat \ -u jdbc:hive2://localhost:10000/default --showHeader=false --outputformat=csv2 \ -e 'select * from trips' >> $hiveqlresultsdir/results.csv numRecordsHiveQL=$(cat $hiveqlresultsdir/*.csv | wc -l) From dc4ac6f5b52e3f265bc58ec9e477c2401485582a Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 14:59:17 -0500 Subject: [PATCH 10/41] add volume to docker compose --- .../docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml index d654e0dcf46d9..e2a95c09b50d6 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml @@ -142,6 +142,7 @@ services: - "namenode" volumes: - ${HUDI_WS}:/var/hoodie/ws + - /Users/jon/Desktop/hiveWorkload:/var/hiveWorkload sparkmaster: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:linux-arm64-0.10.1 @@ -228,6 +229,7 @@ services: - "namenode" volumes: - ${HUDI_WS}:/var/hoodie/ws + - /Users/jon/Desktop/hiveWorkload:/var/hiveWorkload adhoc-2: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1 @@ -250,6 +252,7 @@ services: - "namenode" volumes: - ${HUDI_WS}:/var/hoodie/ws + - /Users/jon/Desktop/hiveWorkload:/var/hiveWorkload volumes: namenode: From 8669cce7fd795f8ababda340a04f0a517d29693c Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 16:49:47 -0500 Subject: [PATCH 11/41] need to lowercase the field names omg --- .../java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index f3d8d95a4a894..306d0c2808f13 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -54,6 +54,7 @@ import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.function.UnaryOperator; import java.util.stream.Collectors; @@ -112,7 +113,7 @@ public ClosableIterator getFileRecordIterator(Path filePath, long } private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchema) { - List dataColumnNameList = dataSchema.getFields().stream().map(Schema.Field::name).collect(Collectors.toList()); + List dataColumnNameList = dataSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).collect(Collectors.toList()); List dataColumnTypeList = dataColumnNameList.stream().map(fieldName -> { TypeInfo type = columnTypeMap.get(fieldName); if (type == null) { From fdeea22de1ecd03ca69780e02e47df265a26cd37 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 17:00:33 -0500 Subject: [PATCH 12/41] don't remove partition if it's listed in the schema --- .../apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index e718ef20564b6..196f410da0719 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -57,6 +57,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.function.UnaryOperator; @@ -232,6 +233,8 @@ private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) } else { partitionColumns = Arrays.stream(partitionColString.split(",")).collect(Collectors.toSet()); } + + tableSchema.getFields().forEach(f -> partitionColumns.remove(f.name().toLowerCase(Locale.ROOT))); return HoodieAvroUtils.generateProjectionSchema(tableSchema, Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c -> !partitionColumns.contains(c)).collect(Collectors.toList())); } From 60fe6faba8cde4489fcc532c55815c482b159a0a Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 17:57:07 -0500 Subject: [PATCH 13/41] put partition cols at end of output --- .../HoodieFileGroupReaderRecordReader.java | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index 196f410da0719..041e072896564 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -56,12 +56,14 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.function.UnaryOperator; import java.util.stream.Collectors; +import java.util.stream.Stream; public class HoodieFileGroupReaderRecordReader implements RecordReader { @@ -86,6 +88,7 @@ public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, final JobConf jobConf, final Reporter reporter) throws IOException { HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); + Set partitionColumns = new HashSet<>(getPartitionFieldNames(jobConf)); this.inputSplit = split; this.jobConf = jobConf; FileSplit fileSplit = (FileSplit) split; @@ -104,7 +107,10 @@ public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, tableSchema, requestedSchema, metaClient.getTableConfig().getProps(), metaClient.getTableConfig(), fileSplit.getStart(), fileSplit.getLength(), false); this.fileGroupReader.initRecordIterators(); - this.reverseProjection = readerContext.reverseProjectRecord(requestedSchema, tableSchema); + Schema outputSchema = HoodieAvroUtils.generateProjectionSchema(tableSchema, + Stream.concat(tableSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).filter(partitionColumns::contains), + partitionColumns.stream()).collect(Collectors.toList())); + this.reverseProjection = readerContext.reverseProjectRecord(requestedSchema, outputSchema); } @Override @@ -150,17 +156,19 @@ public JobConf getJobConf() { return jobConf; } + private static List getPartitionFieldNames(JobConf jobConf) { + String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); + return partitionFields.length() > 0 ? Arrays.stream(partitionFields.split("/")).collect(Collectors.toList()) + : new ArrayList<>(); + } + private static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, JobConf jobConf) { TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); try { Option schemaOpt = tableSchemaResolver.getTableAvroSchemaFromLatestCommit(true); if (schemaOpt.isPresent()) { // Add partitioning fields to writer schema for resulting row to contain null values for these fields - String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); - List partitioningFields = - partitionFields.length() > 0 ? Arrays.stream(partitionFields.split("/")).collect(Collectors.toList()) - : new ArrayList<>(); - return HoodieRealtimeRecordReaderUtils.addPartitionFields(schemaOpt.get(), partitioningFields); + return HoodieRealtimeRecordReaderUtils.addPartitionFields(schemaOpt.get(), getPartitionFieldNames(jobConf)); } throw new RuntimeException("Unable to get table schema"); } catch (Exception e) { @@ -233,7 +241,7 @@ private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) } else { partitionColumns = Arrays.stream(partitionColString.split(",")).collect(Collectors.toSet()); } - + //if they are actually written to the file, then it is ok to read them from the file tableSchema.getFields().forEach(f -> partitionColumns.remove(f.name().toLowerCase(Locale.ROOT))); return HoodieAvroUtils.generateProjectionSchema(tableSchema, Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c -> !partitionColumns.contains(c)).collect(Collectors.toList())); From ad3f3d3fa7998538553e106ac2cfe29b9db20ad2 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 28 Dec 2023 18:00:38 -0500 Subject: [PATCH 14/41] invert filter --- .../apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index 041e072896564..2d7c6567c6df6 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -108,7 +108,7 @@ public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, fileSplit.getLength(), false); this.fileGroupReader.initRecordIterators(); Schema outputSchema = HoodieAvroUtils.generateProjectionSchema(tableSchema, - Stream.concat(tableSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).filter(partitionColumns::contains), + Stream.concat(tableSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).filter(n -> !partitionColumns.contains(n)), partitionColumns.stream()).collect(Collectors.toList())); this.reverseProjection = readerContext.reverseProjectRecord(requestedSchema, outputSchema); } From 6a7748026a6144739ce64e69f4d6698e4e69b026 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Fri, 29 Dec 2023 13:56:58 -0500 Subject: [PATCH 15/41] support no base file, only log. support read from non-hudi table --- .../hudi/hadoop/HiveHoodieReaderContext.java | 2 +- .../HoodieFileGroupReaderRecordReader.java | 18 +++++++++++++++--- .../hudi/hadoop/HoodieParquetInputFormat.java | 4 ++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index 306d0c2808f13..399ff3c8c49c2 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -154,7 +154,7 @@ public Object getValue(ArrayWritable record, Schema schema, String fieldName) { @Override public HoodieRecord constructHoodieRecord(Option recordOption, Map metadataMap) { if (!recordOption.isPresent()) { - return new HoodieEmptyRecord<>(new HoodieKey((String) metadataMap.get(INTERNAL_META_RECORD_KEY), (String) metadataMap.get(INTERNAL_META_PARTITION_PATH)), HoodieRecord.HoodieRecordType.AVRO); + return new HoodieEmptyRecord<>(new HoodieKey((String) metadataMap.get(INTERNAL_META_RECORD_KEY), (String) metadataMap.get(INTERNAL_META_PARTITION_PATH)), HoodieRecord.HoodieRecordType.HIVE); } Schema schema = (Schema) metadataMap.get(INTERNAL_META_SCHEMA); ArrayWritable writable = recordOption.get(); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index 2d7c6567c6df6..d8a11c116e8b4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -208,9 +208,21 @@ private static FileSlice getFileSliceFromSplit(FileSplit split, Map getRecordReader(final InputSpli if (HoodieFileGroupReaderRecordReader.useFilegroupReader(job)) { try { + if (!(split instanceof FileSplit) || !TablePathUtils.isHoodieTablePath(((FileSplit) split).getPath(), job)) { + return super.getRecordReader(split, job, reporter); + } if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { return new HoodieFileGroupReaderRecordReader((s, j, r) -> { try { From 51f91c7463adba9e2bd3a6a97201b8392d596b84 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Fri, 29 Dec 2023 14:17:12 -0500 Subject: [PATCH 16/41] disable for skip merge as well --- .../apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index d8a11c116e8b4..a9641d1ae817c 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -33,6 +33,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.TablePathUtils; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader; import org.apache.hudi.hadoop.realtime.RealtimeSplit; import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; @@ -261,6 +262,7 @@ private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) public static boolean useFilegroupReader(final JobConf jobConf) { return jobConf.getBoolean(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), HoodieReaderConfig.FILE_GROUP_READER_ENABLED.defaultValue()) - && !jobConf.getBoolean(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.defaultValue()); + && !jobConf.getBoolean(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.defaultValue()) + && !jobConf.getBoolean(HoodieRealtimeRecordReader.REALTIME_SKIP_MERGE_PROP, false); } } From 1da1e763eee33107812b7723f838ce633d827422 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Fri, 29 Dec 2023 15:04:58 -0500 Subject: [PATCH 17/41] fix non hoodie path read --- .../src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java | 3 ++- .../java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java index 2ab77cc98be92..7efcd5fea75a1 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java @@ -117,7 +117,8 @@ public HoodieRecord newInstance(HoodieKey key) { public Comparable getOrderingValue(Schema recordSchema, Properties props) { String orderingField = ConfigUtils.getOrderingField(props); if (orderingField == null) { - throw new IllegalArgumentException("Ordering Field is not set. Precombine must be set. (If you are using a custom record merger it might be something else)"); + return 0; + //throw new IllegalArgumentException("Ordering Field is not set. Precombine must be set. (If you are using a custom record merger it might be something else)"); } return (Comparable) getValue(ConfigUtils.getOrderingField(props)); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 521a477cff2a0..7de9d85ed57b5 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -97,7 +97,7 @@ public RecordReader getRecordReader(final InputSpli if (HoodieFileGroupReaderRecordReader.useFilegroupReader(job)) { try { - if (!(split instanceof FileSplit) || !TablePathUtils.isHoodieTablePath(((FileSplit) split).getPath(), job)) { + if (!(split instanceof FileSplit) || !TablePathUtils.getTablePath(((FileSplit) split).getPath(), job).isPresent()) { return super.getRecordReader(split, job, reporter); } if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { From 2b1f172e06686f3692419969e4f5aca89a5a1710 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Tue, 2 Jan 2024 08:48:07 -0500 Subject: [PATCH 18/41] revert setting precombine --- .../org/apache/hudi/common/testutils/HoodieTestUtils.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java index 102ee61ac2c33..4d4fc57cc09cd 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java @@ -29,7 +29,6 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.storage.HoodieStorage; @@ -182,16 +181,11 @@ public static HoodieTableMetaClient init(StorageConfiguration storageConf, St public static HoodieTableMetaClient init(StorageConfiguration storageConf, String basePath, HoodieTableType tableType, Properties properties, String databaseName) throws IOException { - String preCombineField = ConfigUtils.getOrderingField(properties); - if (preCombineField == null && tableType.equals(HoodieTableType.MERGE_ON_READ)) { - preCombineField = "timestamp"; - } HoodieTableMetaClient.PropertyBuilder builder = HoodieTableMetaClient.withPropertyBuilder() .setDatabaseName(databaseName) .setTableName(RAW_TRIPS_TEST_NAME) .setTableType(tableType) - .setPreCombineField(preCombineField) .setPayloadClass(HoodieAvroPayload.class); String keyGen = properties.getProperty("hoodie.datasource.write.keygenerator.class"); From 7b45027f0ccb1b558fe982038ae209b923d7babe Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Tue, 2 Jan 2024 10:04:52 -0500 Subject: [PATCH 19/41] fix no meta cols table --- .../table/TestHoodieMergeOnReadTable.java | 2 +- .../HoodieFileGroupReaderRecordReader.java | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java index b0876d0610370..ae81a310190cb 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java @@ -213,7 +213,7 @@ public void testUpsertPartitioner(boolean populateMetaFields) throws Exception { .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) .collect(Collectors.toList()); List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(storageConf(), inputPaths, - basePath(), new JobConf(storageConf().unwrap()), true, false); + basePath(), new JobConf(storageConf().unwrap()), true, populateMetaFields); // Wrote 20 records in 2 batches assertEquals(40, recordsRead.size(), "Must contain 40 records"); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index a9641d1ae817c..d531ac0c1d20b 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -98,13 +98,14 @@ public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, .setConf(jobConf) .setBasePath(tableBasePath) .build(); - Schema tableSchema = getLatestTableSchema(metaClient, jobConf); + String latestCommitTime = getLatestCommitTime(split, metaClient); + Schema tableSchema = getLatestTableSchema(metaClient, jobConf, latestCommitTime); Schema requestedSchema = createRequestedSchema(tableSchema, jobConf); Map hosts = new HashMap<>(); this.readerContext = new HiveHoodieReaderContext(readerCreator, split, jobConf, reporter, tableSchema, hosts, metaClient); this.arrayWritable = new ArrayWritable(Writable.class, new Writable[requestedSchema.getFields().size()]); this.fileGroupReader = new HoodieFileGroupReader<>(readerContext, jobConf, tableBasePath, - getLatestCommitTime(split, metaClient), getFileSliceFromSplit(fileSplit, hosts, readerContext.getFs(tableBasePath, jobConf), tableBasePath), + latestCommitTime, getFileSliceFromSplit(fileSplit, hosts, readerContext.getFs(tableBasePath, jobConf), tableBasePath), tableSchema, requestedSchema, metaClient.getTableConfig().getProps(), metaClient.getTableConfig(), fileSplit.getStart(), fileSplit.getLength(), false); this.fileGroupReader.initRecordIterators(); @@ -163,15 +164,12 @@ private static List getPartitionFieldNames(JobConf jobConf) { : new ArrayList<>(); } - private static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, JobConf jobConf) { + private static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, JobConf jobConf, String latestCommitTime) { TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); try { - Option schemaOpt = tableSchemaResolver.getTableAvroSchemaFromLatestCommit(true); - if (schemaOpt.isPresent()) { - // Add partitioning fields to writer schema for resulting row to contain null values for these fields - return HoodieRealtimeRecordReaderUtils.addPartitionFields(schemaOpt.get(), getPartitionFieldNames(jobConf)); - } - throw new RuntimeException("Unable to get table schema"); + Schema schema = tableSchemaResolver.getTableAvroSchema(latestCommitTime); + // Add partitioning fields to writer schema for resulting row to contain null values for these fields + return HoodieRealtimeRecordReaderUtils.addPartitionFields(schema, getPartitionFieldNames(jobConf)); } catch (Exception e) { throw new RuntimeException("Unable to get table schema", e); } @@ -260,6 +258,9 @@ private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c -> !partitionColumns.contains(c)).collect(Collectors.toList())); } + /** + * `schema.on.read` and skip merge not implemented + */ public static boolean useFilegroupReader(final JobConf jobConf) { return jobConf.getBoolean(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), HoodieReaderConfig.FILE_GROUP_READER_ENABLED.defaultValue()) && !jobConf.getBoolean(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.defaultValue()) From 16cff402481bc10edd2c27b3af446af22eafb842 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Tue, 2 Jan 2024 11:22:04 -0500 Subject: [PATCH 20/41] check if no requested fields --- .../hudi/hadoop/HoodieFileGroupReaderRecordReader.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index d531ac0c1d20b..f860522165ba0 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -32,6 +32,7 @@ import org.apache.hudi.common.table.read.HoodieFileGroupReader; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.TablePathUtils; import org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader; import org.apache.hudi.hadoop.realtime.RealtimeSplit; @@ -244,6 +245,11 @@ private static BaseFile createBootstrapBaseFile(FileSplit split, Map partitionColumns; From e115fd57c4f2b33ff827c97fe73aa48e181260d6 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Tue, 2 Jan 2024 11:26:21 -0500 Subject: [PATCH 21/41] create empty schema properly --- .../apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index f860522165ba0..1c85145fd76e9 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -247,8 +247,10 @@ private static BaseFile createBootstrapBaseFile(FileSplit split, Map Date: Tue, 2 Jan 2024 11:41:46 -0500 Subject: [PATCH 22/41] check if metadata folder exists --- .../hudi/hadoop/HoodieParquetInputFormat.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 7de9d85ed57b5..45be9e43bcbb4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -19,6 +19,8 @@ package org.apache.hudi.hadoop; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.TablePathUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; @@ -26,6 +28,7 @@ import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; @@ -91,13 +94,25 @@ private void initAvroInputFormat() { } } + private static boolean checkTableIsHudi(final InputSplit split, final JobConf job) { + try { + Option tablePathOpt = TablePathUtils.getTablePath(((FileSplit) split).getPath(), job); + if (!tablePathOpt.isPresent()) { + return false; + } + return tablePathOpt.get().getFileSystem(job).exists(new Path(tablePathOpt.get(), HoodieTableMetaClient.METAFOLDER_NAME)); + } catch (IOException e) { + return false; + } + } + @Override public RecordReader getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException { if (HoodieFileGroupReaderRecordReader.useFilegroupReader(job)) { try { - if (!(split instanceof FileSplit) || !TablePathUtils.getTablePath(((FileSplit) split).getPath(), job).isPresent()) { + if (!(split instanceof FileSplit) || !checkTableIsHudi(split, job)) { return super.getRecordReader(split, job, reporter); } if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { From 73bedb4276133a087e2e818e057cae5a3704e770 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Tue, 2 Jan 2024 12:16:48 -0500 Subject: [PATCH 23/41] handle mor with no meta fields --- .../hudi/hadoop/HiveHoodieReaderContext.java | 16 ++++++++++++++++ .../hudi/hadoop/utils/ObjectInspectorCache.java | 9 +++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index 399ff3c8c49c2..b9b50229e2771 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.CloseableMappingIterator; import org.apache.hudi.exception.HoodieException; @@ -72,6 +73,7 @@ public class HiveHoodieReaderContext extends HoodieReaderContext private final ObjectInspectorCache objectInspectorCache; private RecordReader firstRecordReader = null; + private final String recordKeyField; protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCreator readerCreator, InputSplit split, JobConf jobConf, @@ -86,10 +88,19 @@ protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCr this.writerSchema = writerSchema; this.hosts = hosts; String tableName = metaClient.getTableConfig().getTableName(); + recordKeyField = metaClient.getTableConfig().populateMetaFields() + ? HoodieRecord.RECORD_KEY_METADATA_FIELD + : assertSingleKey(metaClient.getTableConfig().getRecordKeyFields()); this.objectInspectorCache = HoodieArrayWritableAvroUtils.getCacheForTable(tableName, writerSchema, jobConf); this.columnTypeMap = objectInspectorCache.getColumnTypeMap(); } + private static String assertSingleKey(Option recordKeyFieldsOpt) { + ValidationUtils.checkArgument(recordKeyFieldsOpt.isPresent(), "no record key field"); + ValidationUtils.checkArgument(recordKeyFieldsOpt.get().length == 1, "more than 1 record key, and not meta fields"); + return recordKeyFieldsOpt.get()[0]; + } + @Override public FileSystem getFs(String path, Configuration conf) { return FSUtils.getFs(path, conf); @@ -146,6 +157,11 @@ public HoodieRecordMerger getRecordMerger(String mergerStrategy) { } } + @Override + public String getRecordKey(ArrayWritable record, Schema schema) { + return getValue(record, schema, recordKeyField).toString(); + } + @Override public Object getValue(ArrayWritable record, Schema schema, String fieldName) { return objectInspectorCache.getValue(record, schema, fieldName); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java index 47564ae8caa0e..e5e1f125e6f7d 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java @@ -88,7 +88,12 @@ public ArrayWritableObjectInspector getObjectInspector(Schema schema) { } public Object getValue(ArrayWritable record, Schema schema, String fieldName) { - ArrayWritableObjectInspector objectInspector = getObjectInspector(schema); - return objectInspector.getStructFieldData(record, objectInspector.getStructFieldRef(fieldName)); + try { + ArrayWritableObjectInspector objectInspector = getObjectInspector(schema); + return objectInspector.getStructFieldData(record, objectInspector.getStructFieldRef(fieldName)); + } catch (Exception e) { + throw e; + } + } } From 3f412a106bc308172680e80aa63d2e1e2ee3af4d Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Tue, 2 Jan 2024 13:09:33 -0500 Subject: [PATCH 24/41] disable reader for a test because mor seems to work different --- ...HoodieSparkMergeOnReadTableCompaction.java | 80 ++++++++++--------- .../hudi/hadoop/HiveHoodieReaderContext.java | 3 +- 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java index e2ba56f94a350..ef28980d9cf95 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java @@ -22,6 +22,7 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.HoodieReaderConfig; import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.model.DefaultHoodieRecordPayload; import org.apache.hudi.common.model.HoodieKey; @@ -146,43 +147,50 @@ public void testWriteDuringCompaction(String payloadClass) throws IOException { @ParameterizedTest @MethodSource("writeLogTest") public void testWriteLogDuringCompaction(boolean enableMetadataTable, boolean enableTimelineServer) throws IOException { - Properties props = getPropertiesForKeyGen(true); - HoodieWriteConfig config = HoodieWriteConfig.newBuilder() - .forTable("test-trip-table") - .withPath(basePath()) - .withSchema(TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2) - .withAutoCommit(true) - .withEmbeddedTimelineServerEnabled(enableTimelineServer) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withMaxNumDeltaCommitsBeforeCompaction(1).build()) - .withLayoutConfig(HoodieLayoutConfig.newBuilder() - .withLayoutType(HoodieStorageLayout.LayoutType.BUCKET.name()) - .withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build()) - .withIndexConfig(HoodieIndexConfig.newBuilder().fromProperties(props).withIndexType(HoodieIndex.IndexType.BUCKET).withBucketNum("1").build()) - .build(); - props.putAll(config.getProps()); - - metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, props); - client = getHoodieWriteClient(config); - - final List records = dataGen.generateInserts("001", 100); - JavaRDD writeRecords = jsc().parallelize(records, 2); + try { + //disable for this test because it seems like we process mor in a different order? + jsc().hadoopConfiguration().set(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), "false"); + Properties props = getPropertiesForKeyGen(true); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .forTable("test-trip-table") + .withPath(basePath()) + .withSchema(TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2) + .withAutoCommit(true) + .withEmbeddedTimelineServerEnabled(enableTimelineServer) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withLayoutConfig(HoodieLayoutConfig.newBuilder() + .withLayoutType(HoodieStorageLayout.LayoutType.BUCKET.name()) + .withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().fromProperties(props).withIndexType(HoodieIndex.IndexType.BUCKET).withBucketNum("1").build()) + .build(); + props.putAll(config.getProps()); + + metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, props); + client = getHoodieWriteClient(config); + + final List records = dataGen.generateInserts("001", 100); + JavaRDD writeRecords = jsc().parallelize(records, 2); + + // initialize 100 records + client.upsert(writeRecords, client.startCommit()); + // update 100 records + client.upsert(writeRecords, client.startCommit()); + // schedule compaction + client.scheduleCompaction(Option.empty()); + // delete 50 records + List toBeDeleted = records.stream().map(HoodieRecord::getKey).limit(50).collect(Collectors.toList()); + JavaRDD deleteRecords = jsc().parallelize(toBeDeleted, 2); + client.delete(deleteRecords, client.startCommit()); + // insert the same 100 records again + client.upsert(writeRecords, client.startCommit()); + Assertions.assertEquals(100, readTableTotalRecordsNum()); + } finally { + jsc().hadoopConfiguration().set(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), "true"); + } - // initialize 100 records - client.upsert(writeRecords, client.startCommit()); - // update 100 records - client.upsert(writeRecords, client.startCommit()); - // schedule compaction - client.scheduleCompaction(Option.empty()); - // delete 50 records - List toBeDeleted = records.stream().map(HoodieRecord::getKey).limit(50).collect(Collectors.toList()); - JavaRDD deleteRecords = jsc().parallelize(toBeDeleted, 2); - client.delete(deleteRecords, client.startCommit()); - // insert the same 100 records again - client.upsert(writeRecords, client.startCommit()); - Assertions.assertEquals(100, readTableTotalRecordsNum()); } private long readTableTotalRecordsNum() { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index b9b50229e2771..c4a0cdeb8a42d 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.CloseableMappingIterator; @@ -164,7 +165,7 @@ public String getRecordKey(ArrayWritable record, Schema schema) { @Override public Object getValue(ArrayWritable record, Schema schema, String fieldName) { - return objectInspectorCache.getValue(record, schema, fieldName); + return StringUtils.isNullOrEmpty(fieldName) ? null : objectInspectorCache.getValue(record, schema, fieldName); } @Override From 51a47e4f2ddd2563723ce0f28bc9eb308d278dff Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 4 Jan 2024 15:42:48 -0500 Subject: [PATCH 25/41] delete partition column from the jobconf if it is written in the file --- .../HoodieFileGroupReaderRecordReader.java | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index 1c85145fd76e9..b231d8a889c73 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -82,31 +82,32 @@ org.apache.hadoop.mapred.RecordReader getRecordRead private final ArrayWritable arrayWritable; private final NullWritable nullWritable = NullWritable.get(); private final InputSplit inputSplit; - private final JobConf jobConf; + private final JobConf jobConfCopy; private final UnaryOperator reverseProjection; public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, final InputSplit split, final JobConf jobConf, final Reporter reporter) throws IOException { - HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); - Set partitionColumns = new HashSet<>(getPartitionFieldNames(jobConf)); + this.jobConfCopy = new JobConf(jobConf); + HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConfCopy); + Set partitionColumns = new HashSet<>(getPartitionFieldNames(jobConfCopy)); this.inputSplit = split; - this.jobConf = jobConf; + FileSplit fileSplit = (FileSplit) split; - String tableBasePath = getTableBasePath(split, jobConf); + String tableBasePath = getTableBasePath(split, jobConfCopy); HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() - .setConf(jobConf) + .setConf(jobConfCopy) .setBasePath(tableBasePath) .build(); String latestCommitTime = getLatestCommitTime(split, metaClient); - Schema tableSchema = getLatestTableSchema(metaClient, jobConf, latestCommitTime); - Schema requestedSchema = createRequestedSchema(tableSchema, jobConf); + Schema tableSchema = getLatestTableSchema(metaClient, jobConfCopy, latestCommitTime); + Schema requestedSchema = createRequestedSchema(tableSchema, jobConfCopy); Map hosts = new HashMap<>(); - this.readerContext = new HiveHoodieReaderContext(readerCreator, split, jobConf, reporter, tableSchema, hosts, metaClient); + this.readerContext = new HiveHoodieReaderContext(readerCreator, split, jobConfCopy, reporter, tableSchema, hosts, metaClient); this.arrayWritable = new ArrayWritable(Writable.class, new Writable[requestedSchema.getFields().size()]); - this.fileGroupReader = new HoodieFileGroupReader<>(readerContext, jobConf, tableBasePath, - latestCommitTime, getFileSliceFromSplit(fileSplit, hosts, readerContext.getFs(tableBasePath, jobConf), tableBasePath), + this.fileGroupReader = new HoodieFileGroupReader<>(readerContext, jobConfCopy, tableBasePath, + latestCommitTime, getFileSliceFromSplit(fileSplit, hosts, readerContext.getFs(tableBasePath, jobConfCopy), tableBasePath), tableSchema, requestedSchema, metaClient.getTableConfig().getProps(), metaClient.getTableConfig(), fileSplit.getStart(), fileSplit.getLength(), false); this.fileGroupReader.initRecordIterators(); @@ -156,7 +157,7 @@ public RealtimeSplit getSplit() { } public JobConf getJobConf() { - return jobConf; + return jobConfCopy; } private static List getPartitionFieldNames(JobConf jobConf) { @@ -262,6 +263,9 @@ private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) } //if they are actually written to the file, then it is ok to read them from the file tableSchema.getFields().forEach(f -> partitionColumns.remove(f.name().toLowerCase(Locale.ROOT))); + //need to filter those partitions because they will be added back later on. And we don't want that + jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, + Arrays.stream(jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS).split(",")).filter(partitionColumns::contains).collect(Collectors.joining(","))); return HoodieAvroUtils.generateProjectionSchema(tableSchema, Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c -> !partitionColumns.contains(c)).collect(Collectors.toList())); } From a8020bdecea744c18936847018115c7d3963b80c Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 4 Jan 2024 16:28:11 -0500 Subject: [PATCH 26/41] modify data schema due to partition column madness --- .../hudi/hadoop/HiveHoodieReaderContext.java | 20 ++++++++++++++++--- .../HoodieFileGroupReaderRecordReader.java | 5 +---- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index c4a0cdeb8a42d..c91f6713a4123 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -19,6 +19,8 @@ package org.apache.hudi.hadoop; +import org.apache.hudi.avro.AvroSchemaUtils; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.engine.HoodieReaderContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieEmptyRecord; @@ -55,11 +57,14 @@ import java.io.IOException; import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.function.UnaryOperator; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.apache.hudi.common.model.HoodieRecordMerger.DEFAULT_MERGER_STRATEGY_UUID; @@ -74,6 +79,9 @@ public class HiveHoodieReaderContext extends HoodieReaderContext private final ObjectInspectorCache objectInspectorCache; private RecordReader firstRecordReader = null; + private final List partitionCols; + private final Set partitionColSet; + private final String recordKeyField; protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCreator readerCreator, InputSplit split, @@ -88,6 +96,9 @@ protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCr this.reporter = reporter; this.writerSchema = writerSchema; this.hosts = hosts; + this.partitionCols = HoodieFileGroupReaderRecordReader.getPartitionFieldNames(jobConf).stream() + .filter(n -> writerSchema.getField(n) != null).collect(Collectors.toList()); + this.partitionColSet = new HashSet<>(this.partitionCols); String tableName = metaClient.getTableConfig().getTableName(); recordKeyField = metaClient.getTableConfig().populateMetaFields() ? HoodieRecord.RECORD_KEY_METADATA_FIELD @@ -110,18 +121,21 @@ public FileSystem getFs(String path, Configuration conf) { @Override public ClosableIterator getFileRecordIterator(Path filePath, long start, long length, Schema dataSchema, Schema requiredSchema, Configuration conf) throws IOException { JobConf jobConfCopy = new JobConf(jobConf); - setSchemas(jobConfCopy, dataSchema, requiredSchema); + Schema modifiedDataSchema = HoodieAvroUtils.generateProjectionSchema(dataSchema, Stream.concat(dataSchema.getFields().stream() + .map(f -> f.name().toLowerCase(Locale.ROOT)).filter(n -> !partitionColSet.contains(n)), + partitionCols.stream()).collect(Collectors.toList())); + setSchemas(jobConfCopy, modifiedDataSchema, requiredSchema); InputSplit inputSplit = new FileSplit(filePath, start, length, hosts.get(filePath.toString())); RecordReader recordReader = readerCreator.getRecordReader(inputSplit, jobConfCopy, reporter); if (firstRecordReader == null) { firstRecordReader = recordReader; } ClosableIterator recordIterator = new RecordReaderValueIterator<>(recordReader); - if (dataSchema.equals(requiredSchema)) { + if (modifiedDataSchema.equals(requiredSchema)) { return recordIterator; } //The record reader puts the required columns in the positions of the data schema and nulls the rest of the columns - return new CloseableMappingIterator<>(recordIterator, projectRecord(dataSchema, requiredSchema)); + return new CloseableMappingIterator<>(recordIterator, projectRecord(modifiedDataSchema, requiredSchema)); } private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchema) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index b231d8a889c73..f43d97dab3f3a 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -160,7 +160,7 @@ public JobConf getJobConf() { return jobConfCopy; } - private static List getPartitionFieldNames(JobConf jobConf) { + public static List getPartitionFieldNames(JobConf jobConf) { String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); return partitionFields.length() > 0 ? Arrays.stream(partitionFields.split("/")).collect(Collectors.toList()) : new ArrayList<>(); @@ -263,9 +263,6 @@ private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) } //if they are actually written to the file, then it is ok to read them from the file tableSchema.getFields().forEach(f -> partitionColumns.remove(f.name().toLowerCase(Locale.ROOT))); - //need to filter those partitions because they will be added back later on. And we don't want that - jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, - Arrays.stream(jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS).split(",")).filter(partitionColumns::contains).collect(Collectors.joining(","))); return HoodieAvroUtils.generateProjectionSchema(tableSchema, Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c -> !partitionColumns.contains(c)).collect(Collectors.toList())); } From 25078f3a2119dc32a767d011a15b952b6710f767 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 4 Jan 2024 16:29:05 -0500 Subject: [PATCH 27/41] remove unused import --- .../java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index c91f6713a4123..e00b1d24570c5 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -19,7 +19,6 @@ package org.apache.hudi.hadoop; -import org.apache.hudi.avro.AvroSchemaUtils; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.engine.HoodieReaderContext; import org.apache.hudi.common.fs.FSUtils; From c225f52b15ea339ec7bf86ed485493ab3110eccd Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 4 Jan 2024 17:36:20 -0500 Subject: [PATCH 28/41] add some comments --- .../java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java | 1 + .../apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java | 1 + 2 files changed, 2 insertions(+) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index e00b1d24570c5..fc9c7bd12c942 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -120,6 +120,7 @@ public FileSystem getFs(String path, Configuration conf) { @Override public ClosableIterator getFileRecordIterator(Path filePath, long start, long length, Schema dataSchema, Schema requiredSchema, Configuration conf) throws IOException { JobConf jobConfCopy = new JobConf(jobConf); + //move the partition cols to the end, because in some cases it has issues if we don't do that Schema modifiedDataSchema = HoodieAvroUtils.generateProjectionSchema(dataSchema, Stream.concat(dataSchema.getFields().stream() .map(f -> f.name().toLowerCase(Locale.ROOT)).filter(n -> !partitionColSet.contains(n)), partitionCols.stream()).collect(Collectors.toList())); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index f43d97dab3f3a..230e5257d7e28 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -111,6 +111,7 @@ latestCommitTime, getFileSliceFromSplit(fileSplit, hosts, readerContext.getFs(ta tableSchema, requestedSchema, metaClient.getTableConfig().getProps(), metaClient.getTableConfig(), fileSplit.getStart(), fileSplit.getLength(), false); this.fileGroupReader.initRecordIterators(); + //it expects the partition columns to be at the end Schema outputSchema = HoodieAvroUtils.generateProjectionSchema(tableSchema, Stream.concat(tableSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).filter(n -> !partitionColumns.contains(n)), partitionColumns.stream()).collect(Collectors.toList())); From 069a5ad842473b8983684a42eca58f196c29e667 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Thu, 4 Jan 2024 19:41:18 -0500 Subject: [PATCH 29/41] don't add partition fields when the data schema doesn't have them --- .../java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index fc9c7bd12c942..0f5b5f8884ea9 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -123,7 +123,7 @@ public ClosableIterator getFileRecordIterator(Path filePath, long //move the partition cols to the end, because in some cases it has issues if we don't do that Schema modifiedDataSchema = HoodieAvroUtils.generateProjectionSchema(dataSchema, Stream.concat(dataSchema.getFields().stream() .map(f -> f.name().toLowerCase(Locale.ROOT)).filter(n -> !partitionColSet.contains(n)), - partitionCols.stream()).collect(Collectors.toList())); + partitionCols.stream().filter(c -> dataSchema.getField(c) != null)).collect(Collectors.toList())); setSchemas(jobConfCopy, modifiedDataSchema, requiredSchema); InputSplit inputSplit = new FileSplit(filePath, start, length, hosts.get(filePath.toString())); RecordReader recordReader = readerCreator.getRecordReader(inputSplit, jobConfCopy, reporter); From 185217ddb1aaa5fb99b054c4c537553606aaa16b Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Fri, 19 Jan 2024 14:15:39 -0500 Subject: [PATCH 30/41] address review feedback --- .../hudi/hadoop/HoodieParquetInputFormat.java | 4 +- .../AbstractRealtimeRecordReader.java | 4 - .../HoodieCombineRealtimeRecordReader.java | 108 +++++++----------- .../hadoop/utils/ObjectInspectorCache.java | 4 + .../TestHoodieArrayWritableAvroUtils.java | 88 ++++++++++++++ packaging/bundle-validation/validate.sh | 2 +- 6 files changed, 135 insertions(+), 75 deletions(-) create mode 100644 hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/utils/TestHoodieArrayWritableAvroUtils.java diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 45be9e43bcbb4..31b0dfc087c5a 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -94,7 +94,7 @@ private void initAvroInputFormat() { } } - private static boolean checkTableIsHudi(final InputSplit split, final JobConf job) { + private static boolean checkIfHudiTable(final InputSplit split, final JobConf job) { try { Option tablePathOpt = TablePathUtils.getTablePath(((FileSplit) split).getPath(), job); if (!tablePathOpt.isPresent()) { @@ -112,7 +112,7 @@ public RecordReader getRecordReader(final InputSpli if (HoodieFileGroupReaderRecordReader.useFilegroupReader(job)) { try { - if (!(split instanceof FileSplit) || !checkTableIsHudi(split, job)) { + if (!(split instanceof FileSplit) || !checkIfHudiTable(split, job)) { return super.getRecordReader(split, job, reporter); } if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java index 88eef5cc57ead..058ca11a9a07d 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java @@ -27,7 +27,6 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hadoop.HoodieColumnProjectionUtils; -import org.apache.hudi.hadoop.HoodieFileGroupReaderRecordReader; import org.apache.hudi.hadoop.SchemaEvolutionContext; import org.apache.hudi.hadoop.fs.HadoopFSUtils; import org.apache.hudi.hadoop.utils.HiveAvroSerializer; @@ -104,9 +103,6 @@ public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) { throw new HoodieException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); } prepareHiveAvroSerializer(); - if (HoodieFileGroupReaderRecordReader.useFilegroupReader(jobConf)) { - throw new IllegalStateException("Should not be here, should be using filegroup reader"); - } } private boolean usesCustomPayload(HoodieTableMetaClient metaClient) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java index b666161d76950..38bf212f3fcc1 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java @@ -22,6 +22,7 @@ import org.apache.hudi.hadoop.HoodieFileGroupReaderRecordReader; import org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.io.IOContextMap; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; @@ -43,123 +44,94 @@ public class HoodieCombineRealtimeRecordReader implements RecordReader recordReaders = new LinkedList<>(); + private List recordReaders = new LinkedList<>(); // Points to the currently iterating record reader - private HoodieRealtimeRecordReader currentRecordReader; + private RecordReader currentRecordReader; private final boolean useFileGroupReader; - // RecordReaders for each split - private List recordReadersFG = new LinkedList<>(); - // Points to the currently iterating record reader - private HoodieFileGroupReaderRecordReader currentRecordReaderFG; - public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split, List readers) { useFileGroupReader = HoodieFileGroupReaderRecordReader.useFilegroupReader(jobConf); - if (useFileGroupReader) { - try { - ValidationUtils.checkArgument(((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits().size() == readers - .size(), "Num Splits does not match number of unique RecordReaders!"); - for (InputSplit rtSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) { - LOG.info("Creating new RealtimeRecordReader for split"); + try { + ValidationUtils.checkArgument(((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits().size() == readers + .size(), "Num Splits does not match number of unique RecordReaders!"); + for (InputSplit rtSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) { + if (useFileGroupReader) { + LOG.info("Creating new HoodieFileGroupReaderRecordReader for split"); RecordReader reader = readers.remove(0); ValidationUtils.checkArgument(reader instanceof HoodieFileGroupReaderRecordReader, reader.toString() + "not instance of HoodieFileGroupReaderRecordReader "); - recordReadersFG.add((HoodieFileGroupReaderRecordReader) reader); - } - currentRecordReaderFG = recordReadersFG.remove(0); - } catch (Exception e) { - throw new RuntimeException(e); - } - } else { - try { - ValidationUtils.checkArgument(((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits().size() == readers - .size(), "Num Splits does not match number of unique RecordReaders!"); - for (InputSplit rtSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) { + recordReaders.add(reader); + } else { LOG.info("Creating new RealtimeRecordReader for split"); recordReaders.add( new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) rtSplit, jobConf, readers.remove(0))); } currentRecordReader = recordReaders.remove(0); - } catch (Exception e) { - throw new RuntimeException(e); } + + } catch (Exception e) { + throw new RuntimeException(e); } } @Override public boolean next(NullWritable key, ArrayWritable value) throws IOException { - if (useFileGroupReader) { - if (this.currentRecordReaderFG.next(key, value)) { - return true; - } else if (recordReadersFG.size() > 0) { - this.currentRecordReaderFG.close(); - this.currentRecordReaderFG = recordReadersFG.remove(0); - HoodieFileGroupReaderRecordReader reader = currentRecordReaderFG; - // when switch reader, ioctx should be updated - IOContextMap.get(reader.getJobConf()).setInputPath(reader.getSplit().getPath()); - return next(key, value); + if (this.currentRecordReader.next(key, value)) { + return true; + } else if (recordReaders.size() > 0) { + this.currentRecordReader.close(); + this.currentRecordReader = recordReaders.remove(0); + RecordReader reader; + JobConf jobConf; + Path path; + if (useFileGroupReader) { + reader = currentRecordReader; + jobConf = ((HoodieFileGroupReaderRecordReader) reader).getJobConf(); + path = ((HoodieFileGroupReaderRecordReader) reader).getSplit().getPath(); } else { - return false; + reader = ((HoodieRealtimeRecordReader)currentRecordReader).getReader(); + jobConf = ((AbstractRealtimeRecordReader) reader).getJobConf(); + path = ((AbstractRealtimeRecordReader) reader).getSplit().getPath(); } + // when switch reader, ioctx should be updated + IOContextMap.get(jobConf).setInputPath(path); + return next(key, value); } else { - if (this.currentRecordReader.next(key, value)) { - return true; - } else if (recordReaders.size() > 0) { - this.currentRecordReader.close(); - this.currentRecordReader = recordReaders.remove(0); - AbstractRealtimeRecordReader reader = (AbstractRealtimeRecordReader)currentRecordReader.getReader(); - // when switch reader, ioctx should be updated - IOContextMap.get(reader.getJobConf()).setInputPath(reader.getSplit().getPath()); - return next(key, value); - } else { - return false; - } + return false; } } @Override public NullWritable createKey() { if (useFileGroupReader) { - return this.currentRecordReaderFG.createKey(); + return ((HoodieFileGroupReaderRecordReader) this.currentRecordReader).createKey(); } else { - return this.currentRecordReader.createKey(); + return ((HoodieRealtimeRecordReader) this.currentRecordReader).createKey(); } } @Override public ArrayWritable createValue() { if (useFileGroupReader) { - return this.currentRecordReaderFG.createValue(); + return ((HoodieFileGroupReaderRecordReader) this.currentRecordReader).createValue(); } else { - return this.currentRecordReader.createValue(); + return ((HoodieRealtimeRecordReader) this.currentRecordReader).createValue(); } } @Override public long getPos() throws IOException { - if (useFileGroupReader) { - return this.currentRecordReaderFG.getPos(); - } else { - return this.currentRecordReader.getPos(); - } + return this.currentRecordReader.getPos(); } @Override public void close() throws IOException { - if (useFileGroupReader) { - this.currentRecordReaderFG.close(); - } else { - this.currentRecordReader.close(); - } + this.currentRecordReader.close(); } @Override public float getProgress() throws IOException { - if (useFileGroupReader) { - return this.currentRecordReaderFG.getProgress(); - } else { - return this.currentRecordReader.getProgress(); - } + return this.currentRecordReader.getProgress(); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java index e5e1f125e6f7d..ddcc28851dfd4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/ObjectInspectorCache.java @@ -40,6 +40,10 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +/** + * To read value from an ArrayWritable, an ObjectInspector is needed. + * Object inspectors are cached here or created using the column type map. + */ public class ObjectInspectorCache { private final Map columnTypeMap = new HashMap<>(); private final Cache diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/utils/TestHoodieArrayWritableAvroUtils.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/utils/TestHoodieArrayWritableAvroUtils.java new file mode 100644 index 0000000000000..12676c3ba18da --- /dev/null +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/utils/TestHoodieArrayWritableAvroUtils.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop.utils; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.mapred.JobConf; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.function.UnaryOperator; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestHoodieArrayWritableAvroUtils { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + Schema tableSchema = HoodieTestDataGenerator.AVRO_SCHEMA; + ObjectInspectorCache objectInspectorCache; + + @BeforeEach + public void setup() { + List fields = tableSchema.getFields(); + Configuration conf = HoodieTestUtils.getDefaultHadoopConf(); + JobConf jobConf = new JobConf(conf); + jobConf.set(serdeConstants.LIST_COLUMNS, fields.stream().map(Schema.Field::name).collect(Collectors.joining(","))); + jobConf.set(serdeConstants.LIST_COLUMN_TYPES, HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES); + objectInspectorCache = new ObjectInspectorCache(HoodieTestDataGenerator.AVRO_SCHEMA, jobConf); + } + + @Test + public void testProjection() { + Schema from = tableSchema; + Schema to = HoodieAvroUtils.generateProjectionSchema(from, Arrays.asList("trip_type", "current_ts", "weight")); + UnaryOperator projection = HoodieArrayWritableAvroUtils.projectRecord(from, to); + UnaryOperator reverseProjection = HoodieArrayWritableAvroUtils.reverseProject(to, from); + + //We reuse the ArrayWritable, so we need to get the values before projecting + ArrayWritable record = convertArrayWritable(dataGen.generateGenericRecord()); + Object tripType = objectInspectorCache.getValue(record, from, "trip_type"); + Object currentTs = objectInspectorCache.getValue(record, from, "current_ts"); + Object weight = objectInspectorCache.getValue(record, from, "weight"); + + //Make sure the projected fields can be read + ArrayWritable projectedRecord = projection.apply(record); + assertEquals(tripType, objectInspectorCache.getValue(projectedRecord, to, "trip_type")); + assertEquals(currentTs, objectInspectorCache.getValue(projectedRecord, to, "current_ts")); + assertEquals(weight, objectInspectorCache.getValue(projectedRecord, to, "weight")); + + //Reverse projection, the fields are in the original spots, but only the fields we set can be read. + //Therefore, we can only check the 3 fields that were in the projection + ArrayWritable reverseProjected = reverseProjection.apply(projectedRecord); + assertEquals(tripType, objectInspectorCache.getValue(reverseProjected, from, "trip_type")); + assertEquals(currentTs, objectInspectorCache.getValue(reverseProjected, from, "current_ts")); + assertEquals(weight, objectInspectorCache.getValue(reverseProjected, from, "weight")); + } + + private static ArrayWritable convertArrayWritable(GenericRecord record) { + return (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(record, record.getSchema(), false); + } +} diff --git a/packaging/bundle-validation/validate.sh b/packaging/bundle-validation/validate.sh index d500ea794bd21..d81f3771f0bf5 100755 --- a/packaging/bundle-validation/validate.sh +++ b/packaging/bundle-validation/validate.sh @@ -97,7 +97,7 @@ test_spark_hadoop_mr_bundles () { # save HiveQL query results hiveqlresultsdir=/tmp/hadoop-mr-bundle/hiveql/trips/results mkdir -p $hiveqlresultsdir - $HIVE_HOME/bin/beeline --verbose --hiveconf hive.input.format=org.apache.hudi.hadoop.HoodieParquetInputFormat \ + $HIVE_HOME/bin/beeline --hiveconf hive.input.format=org.apache.hudi.hadoop.HoodieParquetInputFormat \ -u jdbc:hive2://localhost:10000/default --showHeader=false --outputformat=csv2 \ -e 'select * from trips' >> $hiveqlresultsdir/results.csv numRecordsHiveQL=$(cat $hiveqlresultsdir/*.csv | wc -l) From 6d58742a84e6ca64152cf1000990afe529799cc4 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Fri, 19 Jan 2024 14:27:14 -0500 Subject: [PATCH 31/41] accidently put remove in for loop for combine reader --- .../hadoop/realtime/HoodieCombineRealtimeRecordReader.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java index 38bf212f3fcc1..034d7c6b69c86 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java @@ -67,9 +67,8 @@ public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split recordReaders.add( new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) rtSplit, jobConf, readers.remove(0))); } - currentRecordReader = recordReaders.remove(0); } - + currentRecordReader = recordReaders.remove(0); } catch (Exception e) { throw new RuntimeException(e); } From 325a8b7d556707b926d3bb920c457c014412e5f0 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Mon, 29 Jan 2024 11:04:55 -0500 Subject: [PATCH 32/41] get building again --- ...ocker-compose_hadoop284_hive233_spark244_mac_aarch64.yml | 6 +----- .../org/apache/hudi/hadoop/HiveHoodieReaderContext.java | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml index e2a95c09b50d6..0abcf676d5f75 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml @@ -129,11 +129,10 @@ services: - ./hadoop.env environment: SERVICE_PRECONDITION: "hivemetastore:9083" - JAVA_TOOL_OPTIONS: "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005" ports: - "10000:10000" # JVM debugging port (will be mapped to a random port on host) - - "64757:5005" + - "5005" depends_on: - "hivemetastore" links: @@ -142,7 +141,6 @@ services: - "namenode" volumes: - ${HUDI_WS}:/var/hoodie/ws - - /Users/jon/Desktop/hiveWorkload:/var/hiveWorkload sparkmaster: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:linux-arm64-0.10.1 @@ -229,7 +227,6 @@ services: - "namenode" volumes: - ${HUDI_WS}:/var/hoodie/ws - - /Users/jon/Desktop/hiveWorkload:/var/hiveWorkload adhoc-2: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1 @@ -252,7 +249,6 @@ services: - "namenode" volumes: - ${HUDI_WS}:/var/hoodie/ws - - /Users/jon/Desktop/hiveWorkload:/var/hiveWorkload volumes: namenode: diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index 0f5b5f8884ea9..9f06b528f081e 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -21,7 +21,6 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.engine.HoodieReaderContext; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieEmptyRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -33,6 +32,7 @@ import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.CloseableMappingIterator; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hadoop.fs.HadoopFSUtils; import org.apache.hudi.hadoop.utils.HoodieArrayWritableAvroUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; import org.apache.hudi.hadoop.utils.ObjectInspectorCache; @@ -114,7 +114,7 @@ private static String assertSingleKey(Option recordKeyFieldsOpt) { @Override public FileSystem getFs(String path, Configuration conf) { - return FSUtils.getFs(path, conf); + return HadoopFSUtils.getFs(path, conf); } @Override From a012522b44629e437bd66301f6af41f84a9cefb5 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Mon, 29 Jan 2024 12:11:36 -0500 Subject: [PATCH 33/41] address some review comments --- .../org/apache/hudi/hadoop/HiveHoodieReaderContext.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index 9f06b528f081e..b9c5a5784f072 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -158,8 +158,7 @@ private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchem @Override public ArrayWritable convertAvroRecord(IndexedRecord avroRecord) { - //should be support timestamp? - return (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(avroRecord, avroRecord.getSchema(), false); + return (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(avroRecord, avroRecord.getSchema(), true); } @Override @@ -247,14 +246,14 @@ public long getPos() throws IOException { if (firstRecordReader != null) { return firstRecordReader.getPos(); } - return 0; + throw new IllegalStateException("getPos() should not be called before a record reader has been initialized"); } public float getProgress() throws IOException { if (firstRecordReader != null) { return firstRecordReader.getProgress(); } - return 0; + throw new IllegalStateException("getProgress() should not be called before a record reader has been initialized"); } } From 50bd39fdef07d5a775616e52d7c97185db83943c Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Fri, 2 Feb 2024 14:48:01 -0500 Subject: [PATCH 34/41] add reviewer suggested change --- .../java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 31b0dfc087c5a..b950bdfefb577 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -109,7 +109,7 @@ private static boolean checkIfHudiTable(final InputSplit split, final JobConf jo @Override public RecordReader getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException { - + HoodieRealtimeInputFormatUtils.addProjectionField(job, job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").split("/")); if (HoodieFileGroupReaderRecordReader.useFilegroupReader(job)) { try { if (!(split instanceof FileSplit) || !checkIfHudiTable(split, job)) { @@ -153,7 +153,6 @@ public RecordReader getRecordReader(final InputSpli LOG.debug("EMPLOYING DEFAULT RECORD READER - " + split); } - HoodieRealtimeInputFormatUtils.addProjectionField(job, job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").split("/")); return getRecordReaderInternal(split, job, reporter); } From 50a308dbe77e1e84ab8976d2f32e32c47780fd36 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Mon, 5 Feb 2024 12:23:37 -0500 Subject: [PATCH 35/41] add missing params fg reader --- .../HoodieFileGroupReaderRecordReader.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index 230e5257d7e28..d8a45ed6b0587 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -31,9 +31,11 @@ import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.read.HoodieFileGroupReader; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.TablePathUtils; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader; import org.apache.hudi.hadoop.realtime.RealtimeSplit; import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; @@ -67,6 +69,11 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.common.config.HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED; +import static org.apache.hudi.common.config.HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE; +import static org.apache.hudi.common.config.HoodieMemoryConfig.MAX_MEMORY_FOR_MERGE; +import static org.apache.hudi.common.config.HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH; + public class HoodieFileGroupReaderRecordReader implements RecordReader { public interface HiveReaderCreator { @@ -106,10 +113,18 @@ public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, Map hosts = new HashMap<>(); this.readerContext = new HiveHoodieReaderContext(readerCreator, split, jobConfCopy, reporter, tableSchema, hosts, metaClient); this.arrayWritable = new ArrayWritable(Writable.class, new Writable[requestedSchema.getFields().size()]); + //get some config vals + long maxMemoryForMerge = jobConf.getLong(MAX_MEMORY_FOR_MERGE.key(), MAX_MEMORY_FOR_MERGE.defaultValue()); + String spillableMapPath = jobConf.get(SPILLABLE_MAP_BASE_PATH.key(), FileIOUtils.getDefaultSpillableMapBasePath()); + ExternalSpillableMap.DiskMapType spillMapType = ExternalSpillableMap.DiskMapType.valueOf(jobConf.get(SPILLABLE_DISK_MAP_TYPE.key(), + SPILLABLE_DISK_MAP_TYPE.defaultValue().name()).toUpperCase(Locale.ROOT)); + boolean bitmaskCompressEnabled = jobConf.getBoolean(DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), + DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()); + this.fileGroupReader = new HoodieFileGroupReader<>(readerContext, jobConfCopy, tableBasePath, latestCommitTime, getFileSliceFromSplit(fileSplit, hosts, readerContext.getFs(tableBasePath, jobConfCopy), tableBasePath), tableSchema, requestedSchema, metaClient.getTableConfig().getProps(), metaClient.getTableConfig(), fileSplit.getStart(), - fileSplit.getLength(), false); + fileSplit.getLength(), false, maxMemoryForMerge, spillableMapPath,spillMapType, bitmaskCompressEnabled); this.fileGroupReader.initRecordIterators(); //it expects the partition columns to be at the end Schema outputSchema = HoodieAvroUtils.generateProjectionSchema(tableSchema, From 5a33e85d40fa3f39cfc2e6d11324a266ae00d3e6 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Tue, 20 Feb 2024 10:50:37 -0500 Subject: [PATCH 36/41] address some comments --- .../apache/hudi/hadoop/HiveHoodieReaderContext.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index b9c5a5784f072..03f77d8481185 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -106,9 +106,13 @@ protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCr this.columnTypeMap = objectInspectorCache.getColumnTypeMap(); } + /** + * If populate meta fields is false, then getRecordKeyFields() + * should return exactly 1 recordkey field. + */ private static String assertSingleKey(Option recordKeyFieldsOpt) { - ValidationUtils.checkArgument(recordKeyFieldsOpt.isPresent(), "no record key field"); - ValidationUtils.checkArgument(recordKeyFieldsOpt.get().length == 1, "more than 1 record key, and not meta fields"); + ValidationUtils.checkArgument(recordKeyFieldsOpt.isPresent(), "No record key field set in table config, but populateMetaFields is disabled"); + ValidationUtils.checkArgument(recordKeyFieldsOpt.get().length == 1, "More than 1 record key set in table config, but populateMetaFields is disabled"); return recordKeyFieldsOpt.get()[0]; } @@ -132,7 +136,7 @@ public ClosableIterator getFileRecordIterator(Path filePath, long } ClosableIterator recordIterator = new RecordReaderValueIterator<>(recordReader); if (modifiedDataSchema.equals(requiredSchema)) { - return recordIterator; + return recordIterator; } //The record reader puts the required columns in the positions of the data schema and nulls the rest of the columns return new CloseableMappingIterator<>(recordIterator, projectRecord(modifiedDataSchema, requiredSchema)); From 18fbd92eec10c49025db364be79cc9dbfccee362 Mon Sep 17 00:00:00 2001 From: Sagar Sumit Date: Fri, 31 May 2024 16:41:06 +0530 Subject: [PATCH 37/41] Fix post rebase --- .../apache/hudi/hadoop/fs/HadoopFSUtils.java | 15 ++++- .../hudi/hadoop/HiveHoodieReaderContext.java | 67 ++++++++++--------- .../HoodieFileGroupReaderRecordReader.java | 62 ++++++++++------- .../hudi/hadoop/HoodieParquetInputFormat.java | 18 ++--- .../HoodieParquetRealtimeInputFormat.java | 8 ++- .../hadoop/TestHoodieParquetInputFormat.java | 25 ++++--- .../TestHoodieArrayWritableAvroUtils.java | 2 +- 7 files changed, 114 insertions(+), 83 deletions(-) diff --git a/hudi-hadoop-common/src/main/java/org/apache/hudi/hadoop/fs/HadoopFSUtils.java b/hudi-hadoop-common/src/main/java/org/apache/hudi/hadoop/fs/HadoopFSUtils.java index 9b087482c72c8..e8e92f6b42044 100644 --- a/hudi-hadoop-common/src/main/java/org/apache/hudi/hadoop/fs/HadoopFSUtils.java +++ b/hudi-hadoop-common/src/main/java/org/apache/hudi/hadoop/fs/HadoopFSUtils.java @@ -61,6 +61,8 @@ import java.util.regex.Matcher; import java.util.stream.Collectors; +import static org.apache.hudi.common.fs.FSUtils.LOG_FILE_PATTERN; + /** * Utility functions related to accessing the file storage on Hadoop. */ @@ -377,13 +379,24 @@ public static String getRelativePartitionPath(Path basePath, Path fullPartitionP * the file name. */ public static String getFileIdFromLogPath(Path path) { - Matcher matcher = FSUtils.LOG_FILE_PATTERN.matcher(path.getName()); + Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); if (!matcher.find()) { throw new InvalidHoodiePathException(path.toString(), "LogFile"); } return matcher.group(1); } + /** + * Get the second part of the file name in the log file. That will be the delta commit time. + */ + public static String getDeltaCommitTimeFromLogPath(Path path) { + Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); + if (!matcher.find()) { + throw new InvalidHoodiePathException(path.toString(), "LogFile"); + } + return matcher.group(2); + } + /** * Check if the file is a base file of a log file. Then get the fileId appropriately. */ diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index 03f77d8481185..b1c93e58e730a 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -32,15 +32,16 @@ import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.CloseableMappingIterator; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.hadoop.fs.HadoopFSUtils; import org.apache.hudi.hadoop.utils.HoodieArrayWritableAvroUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; import org.apache.hudi.hadoop.utils.ObjectInspectorCache; +import org.apache.hudi.storage.HoodieStorage; +import org.apache.hudi.storage.HoodieStorageUtils; +import org.apache.hudi.storage.StorageConfiguration; +import org.apache.hudi.storage.StoragePath; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; @@ -73,7 +74,7 @@ public class HiveHoodieReaderContext extends HoodieReaderContext protected final JobConf jobConf; protected final Reporter reporter; protected final Schema writerSchema; - protected Map hosts; + protected Map hosts; protected final Map columnTypeMap; private final ObjectInspectorCache objectInspectorCache; private RecordReader firstRecordReader = null; @@ -82,12 +83,13 @@ public class HiveHoodieReaderContext extends HoodieReaderContext private final Set partitionColSet; private final String recordKeyField; + protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCreator readerCreator, InputSplit split, JobConf jobConf, Reporter reporter, Schema writerSchema, - Map hosts, + Map hosts, HoodieTableMetaClient metaClient) { this.readerCreator = readerCreator; this.split = split; @@ -96,7 +98,7 @@ protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCr this.writerSchema = writerSchema; this.hosts = hosts; this.partitionCols = HoodieFileGroupReaderRecordReader.getPartitionFieldNames(jobConf).stream() - .filter(n -> writerSchema.getField(n) != null).collect(Collectors.toList()); + .filter(n -> writerSchema.getField(n) != null).collect(Collectors.toList()); this.partitionColSet = new HashSet<>(this.partitionCols); String tableName = metaClient.getTableConfig().getTableName(); recordKeyField = metaClient.getTableConfig().populateMetaFields() @@ -116,20 +118,38 @@ private static String assertSingleKey(Option recordKeyFieldsOpt) { return recordKeyFieldsOpt.get()[0]; } + private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchema) { + List dataColumnNameList = dataSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).collect(Collectors.toList()); + List dataColumnTypeList = dataColumnNameList.stream().map(fieldName -> { + TypeInfo type = columnTypeMap.get(fieldName); + if (type == null) { + throw new IllegalArgumentException("Field: " + fieldName + ", does not have a defined type"); + } + return type; + }).collect(Collectors.toList()); + jobConf.set(serdeConstants.LIST_COLUMNS, String.join(",", dataColumnNameList)); + jobConf.set(serdeConstants.LIST_COLUMN_TYPES, dataColumnTypeList.stream().map(TypeInfo::getQualifiedName).collect(Collectors.joining(","))); + //don't replace `f -> f.name()` with lambda reference + String readColNames = requiredSchema.getFields().stream().map(f -> f.name()).collect(Collectors.joining(",")); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNames); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, requiredSchema.getFields() + .stream().map(f -> String.valueOf(dataSchema.getField(f.name()).pos())).collect(Collectors.joining(","))); + } + @Override - public FileSystem getFs(String path, Configuration conf) { - return HadoopFSUtils.getFs(path, conf); + public HoodieStorage getStorage(String path, StorageConfiguration conf) { + return HoodieStorageUtils.getStorage(path, conf); } @Override - public ClosableIterator getFileRecordIterator(Path filePath, long start, long length, Schema dataSchema, Schema requiredSchema, Configuration conf) throws IOException { + public ClosableIterator getFileRecordIterator(StoragePath filePath, long start, long length, Schema dataSchema, Schema requiredSchema, HoodieStorage storage) throws IOException { JobConf jobConfCopy = new JobConf(jobConf); //move the partition cols to the end, because in some cases it has issues if we don't do that Schema modifiedDataSchema = HoodieAvroUtils.generateProjectionSchema(dataSchema, Stream.concat(dataSchema.getFields().stream() .map(f -> f.name().toLowerCase(Locale.ROOT)).filter(n -> !partitionColSet.contains(n)), partitionCols.stream().filter(c -> dataSchema.getField(c) != null)).collect(Collectors.toList())); setSchemas(jobConfCopy, modifiedDataSchema, requiredSchema); - InputSplit inputSplit = new FileSplit(filePath, start, length, hosts.get(filePath.toString())); + InputSplit inputSplit = new FileSplit(new Path(filePath.toString()), start, length, hosts.get(filePath.toString())); RecordReader recordReader = readerCreator.getRecordReader(inputSplit, jobConfCopy, reporter); if (firstRecordReader == null) { firstRecordReader = recordReader; @@ -142,24 +162,6 @@ public ClosableIterator getFileRecordIterator(Path filePath, long return new CloseableMappingIterator<>(recordIterator, projectRecord(modifiedDataSchema, requiredSchema)); } - private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchema) { - List dataColumnNameList = dataSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).collect(Collectors.toList()); - List dataColumnTypeList = dataColumnNameList.stream().map(fieldName -> { - TypeInfo type = columnTypeMap.get(fieldName); - if (type == null) { - throw new IllegalArgumentException("Field: " + fieldName + ", does not have a defined type"); - } - return type; - }).collect(Collectors.toList()); - jobConf.set(serdeConstants.LIST_COLUMNS, String.join(",", dataColumnNameList)); - jobConf.set(serdeConstants.LIST_COLUMN_TYPES, dataColumnTypeList.stream().map(TypeInfo::getQualifiedName).collect(Collectors.joining(","))); - //don't replace `f -> f.name()` with lambda reference - String readColNames = requiredSchema.getFields().stream().map(f -> f.name()).collect(Collectors.joining(",")); - jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNames); - jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, requiredSchema.getFields() - .stream().map(f -> String.valueOf(dataSchema.getField(f.name()).pos())).collect(Collectors.joining(","))); - } - @Override public ArrayWritable convertAvroRecord(IndexedRecord avroRecord) { return (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(avroRecord, avroRecord.getSchema(), true); @@ -167,12 +169,10 @@ public ArrayWritable convertAvroRecord(IndexedRecord avroRecord) { @Override public HoodieRecordMerger getRecordMerger(String mergerStrategy) { - switch (mergerStrategy) { - case DEFAULT_MERGER_STRATEGY_UUID: - return new HoodieHiveRecordMerger(); - default: - throw new HoodieException("The merger strategy UUID is not supported: " + mergerStrategy); + if (mergerStrategy.equals(DEFAULT_MERGER_STRATEGY_UUID)) { + return new HoodieHiveRecordMerger(); } + throw new HoodieException("The merger strategy UUID is not supported, Default: %s, Passed: " + mergerStrategy); } @Override @@ -210,6 +210,7 @@ public ClosableIterator mergeBootstrapReaders(ClosableIterator() { private final ArrayWritable returnWritable = new ArrayWritable(Writable.class); + @Override public boolean hasNext() { if (dataFileIterator.hasNext() != skeletonFileIterator.hasNext()) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java index d8a45ed6b0587..62f36864c99a9 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java @@ -22,7 +22,6 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.config.HoodieReaderConfig; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; @@ -40,6 +39,9 @@ import org.apache.hudi.hadoop.realtime.RealtimeSplit; import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; +import org.apache.hudi.storage.HoodieStorage; +import org.apache.hudi.storage.StoragePath; +import org.apache.hudi.storage.hadoop.HoodieHadoopStorage; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; @@ -73,8 +75,18 @@ import static org.apache.hudi.common.config.HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE; import static org.apache.hudi.common.config.HoodieMemoryConfig.MAX_MEMORY_FOR_MERGE; import static org.apache.hudi.common.config.HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH; +import static org.apache.hudi.common.fs.FSUtils.getCommitTime; +import static org.apache.hudi.common.fs.FSUtils.getFileId; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePath; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePathInfo; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getDeltaCommitTimeFromLogPath; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getFileIdFromLogPath; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getFs; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getRelativePartitionPath; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getStorageConf; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.isLogFile; -public class HoodieFileGroupReaderRecordReader implements RecordReader { +public class HoodieFileGroupReaderRecordReader implements RecordReader { public interface HiveReaderCreator { org.apache.hadoop.mapred.RecordReader getRecordReader( @@ -104,7 +116,7 @@ public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, FileSplit fileSplit = (FileSplit) split; String tableBasePath = getTableBasePath(split, jobConfCopy); HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() - .setConf(jobConfCopy) + .setConf(getStorageConf(jobConfCopy)) .setBasePath(tableBasePath) .build(); String latestCommitTime = getLatestCommitTime(split, metaClient); @@ -116,15 +128,15 @@ public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, //get some config vals long maxMemoryForMerge = jobConf.getLong(MAX_MEMORY_FOR_MERGE.key(), MAX_MEMORY_FOR_MERGE.defaultValue()); String spillableMapPath = jobConf.get(SPILLABLE_MAP_BASE_PATH.key(), FileIOUtils.getDefaultSpillableMapBasePath()); - ExternalSpillableMap.DiskMapType spillMapType = ExternalSpillableMap.DiskMapType.valueOf(jobConf.get(SPILLABLE_DISK_MAP_TYPE.key(), + ExternalSpillableMap.DiskMapType spillMapType = ExternalSpillableMap.DiskMapType.valueOf(jobConf.get(SPILLABLE_DISK_MAP_TYPE.key(), SPILLABLE_DISK_MAP_TYPE.defaultValue().name()).toUpperCase(Locale.ROOT)); boolean bitmaskCompressEnabled = jobConf.getBoolean(DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()); - this.fileGroupReader = new HoodieFileGroupReader<>(readerContext, jobConfCopy, tableBasePath, - latestCommitTime, getFileSliceFromSplit(fileSplit, hosts, readerContext.getFs(tableBasePath, jobConfCopy), tableBasePath), + this.fileGroupReader = new HoodieFileGroupReader<>(readerContext, metaClient.getStorage(), tableBasePath, + latestCommitTime, getFileSliceFromSplit(fileSplit, hosts, getFs(tableBasePath, jobConfCopy), tableBasePath), tableSchema, requestedSchema, metaClient.getTableConfig().getProps(), metaClient.getTableConfig(), fileSplit.getStart(), - fileSplit.getLength(), false, maxMemoryForMerge, spillableMapPath,spillMapType, bitmaskCompressEnabled); + fileSplit.getLength(), false, maxMemoryForMerge, spillableMapPath, spillMapType, bitmaskCompressEnabled); this.fileGroupReader.initRecordIterators(); //it expects the partition columns to be at the end Schema outputSchema = HoodieAvroUtils.generateProjectionSchema(tableSchema, @@ -178,8 +190,7 @@ public JobConf getJobConf() { public static List getPartitionFieldNames(JobConf jobConf) { String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); - return partitionFields.length() > 0 ? Arrays.stream(partitionFields.split("/")).collect(Collectors.toList()) - : new ArrayList<>(); + return partitionFields.isEmpty() ? new ArrayList<>() : Arrays.stream(partitionFields.split("/")).collect(Collectors.toList()); } private static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, JobConf jobConf, String latestCommitTime) { @@ -198,9 +209,10 @@ public static String getTableBasePath(InputSplit split, JobConf jobConf) throws RealtimeSplit realtimeSplit = (RealtimeSplit) split; return realtimeSplit.getBasePath(); } else { - Path inputPath = ((FileSplit)split).getPath(); - FileSystem fs = inputPath.getFileSystem(jobConf); - Option tablePath = TablePathUtils.getTablePath(fs, inputPath); + Path inputPath = ((FileSplit) split).getPath(); + FileSystem fs = inputPath.getFileSystem(jobConf); + HoodieStorage storage = new HoodieHadoopStorage(fs); + Option tablePath = TablePathUtils.getTablePath(storage, convertToStoragePath(inputPath)); return tablePath.get().toString(); } } @@ -225,30 +237,32 @@ private static FileSlice getFileSliceFromSplit(FileSplit split, Map hosts, FileSystem fs) throws IOException { @@ -256,7 +270,7 @@ private static BaseFile createBootstrapBaseFile(FileSplit split, Map tablePathOpt = TablePathUtils.getTablePath(((FileSplit) split).getPath(), job); - if (!tablePathOpt.isPresent()) { - return false; - } - return tablePathOpt.get().getFileSystem(job).exists(new Path(tablePathOpt.get(), HoodieTableMetaClient.METAFOLDER_NAME)); + Path inputPath = ((FileSplit) split).getPath(); + FileSystem fs = inputPath.getFileSystem(job); + HoodieStorage storage = new HoodieHadoopStorage(fs); + return isHoodieTablePath(storage, convertToStoragePath(inputPath)); } catch (IOException e) { return false; } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index 3974a4c7e3bd4..c8be61423c90f 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -18,7 +18,6 @@ package org.apache.hudi.hadoop.realtime; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; @@ -46,6 +45,9 @@ import java.util.Arrays; import java.util.List; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getStorageConf; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.isLogFile; + /** * Input Format, that provides a real-time view of data in a Hoodie table. */ @@ -76,14 +78,14 @@ public RecordReader getRecordReader(final InputSpli } // add preCombineKey - HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(realtimeSplit.getBasePath()).build(); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(getStorageConf(jobConf)).setBasePath(realtimeSplit.getBasePath()).build(); HoodieTableConfig tableConfig = metaClient.getTableConfig(); addProjectionToJobConf(realtimeSplit, jobConf, tableConfig); LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); // for log only split, set the parquet reader as empty. - if (FSUtils.isLogFile(realtimeSplit.getPath())) { + if (isLogFile(realtimeSplit.getPath())) { return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, new HoodieEmptyRecordReader(realtimeSplit, jobConf)); } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java index e13d284606f12..7d7a2eec626c9 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java @@ -369,7 +369,7 @@ public void testIncrementalSimple() throws IOException { files = inputFormat.listStatus(jobConf); assertEquals(10, files.length, "When hoodie.incremental.use.database is true and hoodie.database.name is not null or empty" - + " and the incremental database name is not set, then the incremental query will not take effect"); + + " and the incremental database name is not set, then the incremental query will not take effect"); } @Test @@ -404,7 +404,7 @@ public void testIncrementalWithDatabaseName() throws IOException { metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultStorageConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, baseFileFormat, HoodieTestUtils.HOODIE_DATABASE); assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(), - String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE)); + String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE)); files = inputFormat.listStatus(jobConf); assertEquals(0, files.length, @@ -415,7 +415,7 @@ public void testIncrementalWithDatabaseName() throws IOException { files = inputFormat.listStatus(jobConf); assertEquals(10, files.length, "When hoodie.incremental.use.database is false and the incremental database name is set, " - + "then the incremental query will not take effect"); + + "then the incremental query will not take effect"); // The configuration with and without database name exists together InputFormatTestUtil.setupIncremental(jobConf, "1", 1, true); @@ -423,13 +423,13 @@ public void testIncrementalWithDatabaseName() throws IOException { files = inputFormat.listStatus(jobConf); assertEquals(0, files.length, "When hoodie.incremental.use.database is true, " - + "We should exclude commit 100 because the returning incremental pull with start commit time is 100"); + + "We should exclude commit 100 because the returning incremental pull with start commit time is 100"); InputFormatTestUtil.setupIncremental(jobConf, "1", 1, false); files = inputFormat.listStatus(jobConf); assertEquals(10, files.length, "When hoodie.incremental.use.database is false, " - + "We should include commit 100 because the returning incremental pull with start commit time is 1"); + + "We should include commit 100 because the returning incremental pull with start commit time is 1"); } @Test @@ -680,13 +680,13 @@ public void testSnapshotPreCommitValidate() throws IOException { try { // Verify that Validate mode throws error with invalid commit time - InputFormatTestUtil.setupSnapshotIncludePendingCommits(jobConf, "300"); + InputFormatTestUtil.setupSnapshotIncludePendingCommits(jobConf, "300"); inputFormat.listStatus(jobConf); fail("Expected list status to fail when validate is called with unknown timestamp"); } catch (HoodieIOException e) { // expected because validate is called with invalid instantTime } - + //Creating a new jobCOnf Object because old one has hoodie.%.consume.commit set jobConf = new JobConf(); inputFormat.setConf(jobConf); @@ -752,7 +752,7 @@ public void testInputFormatLoadForEmptyPartitionedTable() throws IOException { } private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit, - int totalExpected) throws IOException { + int totalExpected) throws IOException { int actualCount = 0; int totalCount = 0; InputSplit[] splits = inputFormat.getSplits(jobConf, 1); @@ -823,10 +823,10 @@ public void testHoodieParquetInputFormatReadTimeType() throws IOException { assertEquals(Timestamp.valueOf(localDateTime).toString(), String.valueOf(writable.get()[0])); } else { Date date = new Date(); - date.setTime(testTimestampLong); - Timestamp actualTime = ((TimestampWritable) writable.get()[0]).getTimestamp(); - SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); - assertEquals(dateFormat.format(date), dateFormat.format(actualTime)); + date.setTime(testTimestampLong); + Timestamp actualTime = ((TimestampWritable) writable.get()[0]).getTimestamp(); + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); + assertEquals(dateFormat.format(date), dateFormat.format(actualTime)); } // test long assertEquals(testTimestampLong * 1000, ((LongWritable) writable.get()[1]).get()); @@ -837,6 +837,5 @@ public void testHoodieParquetInputFormatReadTimeType() throws IOException { } finally { jobConf.set(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), "true"); } - } } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/utils/TestHoodieArrayWritableAvroUtils.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/utils/TestHoodieArrayWritableAvroUtils.java index 12676c3ba18da..d7b4a93009b82 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/utils/TestHoodieArrayWritableAvroUtils.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/utils/TestHoodieArrayWritableAvroUtils.java @@ -48,7 +48,7 @@ public class TestHoodieArrayWritableAvroUtils { @BeforeEach public void setup() { List fields = tableSchema.getFields(); - Configuration conf = HoodieTestUtils.getDefaultHadoopConf(); + Configuration conf = HoodieTestUtils.getDefaultStorageConf().unwrap(); JobConf jobConf = new JobConf(conf); jobConf.set(serdeConstants.LIST_COLUMNS, fields.stream().map(Schema.Field::name).collect(Collectors.joining(","))); jobConf.set(serdeConstants.LIST_COLUMN_TYPES, HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES); From 9487e127dd1b8d8f8dd3f36a79621c1cf55748e6 Mon Sep 17 00:00:00 2001 From: Sagar Sumit Date: Fri, 31 May 2024 17:58:48 +0530 Subject: [PATCH 38/41] fix hudi table check --- .../java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index cf0516e03bb10..104f6beda9c6f 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -51,6 +51,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import static org.apache.hudi.common.util.TablePathUtils.getTablePath; import static org.apache.hudi.common.util.TablePathUtils.isHoodieTablePath; import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePath; @@ -102,7 +103,8 @@ private static boolean checkIfHudiTable(final InputSplit split, final JobConf jo Path inputPath = ((FileSplit) split).getPath(); FileSystem fs = inputPath.getFileSystem(job); HoodieStorage storage = new HoodieHadoopStorage(fs); - return isHoodieTablePath(storage, convertToStoragePath(inputPath)); + return getTablePath(storage, convertToStoragePath(inputPath)) + .map(path -> isHoodieTablePath(storage, path)).orElse(false); } catch (IOException e) { return false; } From 2201cb0dea3acbe7597b319be7f14ce7a2a8543f Mon Sep 17 00:00:00 2001 From: Sagar Sumit Date: Fri, 31 May 2024 19:02:06 +0530 Subject: [PATCH 39/41] Address self feedback --- .../common/engine/HoodieReaderContext.java | 10 +-- .../hudi/hadoop/HiveHoodieReaderContext.java | 28 ++++--- ...odieFileGroupReaderBasedRecordReader.java} | 73 ++++++------------- .../apache/hudi/hadoop/HoodieHiveRecord.java | 25 ++----- .../hudi/hadoop/HoodieParquetInputFormat.java | 7 +- .../HoodieCombineRealtimeRecordReader.java | 16 ++-- .../HoodieParquetRealtimeInputFormat.java | 4 +- .../hadoop/utils/HoodieInputFormatUtils.java | 36 +++++++++ 8 files changed, 101 insertions(+), 98 deletions(-) rename hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/{HoodieFileGroupReaderRecordReader.java => HoodieFileGroupReaderBasedRecordReader.java} (82%) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java index 22ab06b354b3b..87336b30a3952 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java @@ -131,9 +131,9 @@ public String getRecordKey(T record, Schema schema) { * @return The ordering value. */ public Comparable getOrderingValue(Option recordOption, - Map metadataMap, - Schema schema, - TypedProperties props) { + Map metadataMap, + Schema schema, + TypedProperties props) { if (metadataMap.containsKey(INTERNAL_META_ORDERING_FIELD)) { return (Comparable) metadataMap.get(INTERNAL_META_ORDERING_FIELD); } @@ -216,7 +216,7 @@ public Map updateSchemaAndResetOrderingValInMetadata(Map mergeBootstrapReaders(ClosableIterator skeletonFileIterator, @@ -229,7 +229,7 @@ public abstract ClosableIterator mergeBootstrapReaders(ClosableIterator sk * all fields in "to" must be in "from", but not all fields in "from" must be in "to" * * @param from the schema of records to be passed into UnaryOperator - * @param to the schema of records produced by UnaryOperator + * @param to the schema of records produced by UnaryOperator * @return a function that takes in a record and returns the record with reordered columns */ public abstract UnaryOperator projectRecord(Schema from, Schema to); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java index b1c93e58e730a..64ea5fe5f0ca0 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieReaderContext.java @@ -67,9 +67,13 @@ import java.util.stream.Stream; import static org.apache.hudi.common.model.HoodieRecordMerger.DEFAULT_MERGER_STRATEGY_UUID; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getPartitionFieldNames; +/** + * {@link HoodieReaderContext} for Hive-specific {@link HoodieFileGroupReaderBasedRecordReader}. + */ public class HiveHoodieReaderContext extends HoodieReaderContext { - protected final HoodieFileGroupReaderRecordReader.HiveReaderCreator readerCreator; + protected final HoodieFileGroupReaderBasedRecordReader.HiveReaderCreator readerCreator; protected final InputSplit split; protected final JobConf jobConf; protected final Reporter reporter; @@ -84,7 +88,7 @@ public class HiveHoodieReaderContext extends HoodieReaderContext private final String recordKeyField; - protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCreator readerCreator, + protected HiveHoodieReaderContext(HoodieFileGroupReaderBasedRecordReader.HiveReaderCreator readerCreator, InputSplit split, JobConf jobConf, Reporter reporter, @@ -97,13 +101,10 @@ protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCr this.reporter = reporter; this.writerSchema = writerSchema; this.hosts = hosts; - this.partitionCols = HoodieFileGroupReaderRecordReader.getPartitionFieldNames(jobConf).stream() - .filter(n -> writerSchema.getField(n) != null).collect(Collectors.toList()); + this.partitionCols = getPartitionFieldNames(jobConf).stream().filter(n -> writerSchema.getField(n) != null).collect(Collectors.toList()); this.partitionColSet = new HashSet<>(this.partitionCols); String tableName = metaClient.getTableConfig().getTableName(); - recordKeyField = metaClient.getTableConfig().populateMetaFields() - ? HoodieRecord.RECORD_KEY_METADATA_FIELD - : assertSingleKey(metaClient.getTableConfig().getRecordKeyFields()); + recordKeyField = getRecordKeyField(metaClient); this.objectInspectorCache = HoodieArrayWritableAvroUtils.getCacheForTable(tableName, writerSchema, jobConf); this.columnTypeMap = objectInspectorCache.getColumnTypeMap(); } @@ -112,7 +113,12 @@ protected HiveHoodieReaderContext(HoodieFileGroupReaderRecordReader.HiveReaderCr * If populate meta fields is false, then getRecordKeyFields() * should return exactly 1 recordkey field. */ - private static String assertSingleKey(Option recordKeyFieldsOpt) { + private static String getRecordKeyField(HoodieTableMetaClient metaClient) { + if (metaClient.getTableConfig().populateMetaFields()) { + return HoodieRecord.RECORD_KEY_METADATA_FIELD; + } + + Option recordKeyFieldsOpt = metaClient.getTableConfig().getRecordKeyFields(); ValidationUtils.checkArgument(recordKeyFieldsOpt.isPresent(), "No record key field set in table config, but populateMetaFields is disabled"); ValidationUtils.checkArgument(recordKeyFieldsOpt.get().length == 1, "More than 1 record key set in table config, but populateMetaFields is disabled"); return recordKeyFieldsOpt.get()[0]; @@ -129,7 +135,7 @@ private void setSchemas(JobConf jobConf, Schema dataSchema, Schema requiredSchem }).collect(Collectors.toList()); jobConf.set(serdeConstants.LIST_COLUMNS, String.join(",", dataColumnNameList)); jobConf.set(serdeConstants.LIST_COLUMN_TYPES, dataColumnTypeList.stream().map(TypeInfo::getQualifiedName).collect(Collectors.joining(","))); - //don't replace `f -> f.name()` with lambda reference + // don't replace `f -> f.name()` with lambda reference String readColNames = requiredSchema.getFields().stream().map(f -> f.name()).collect(Collectors.joining(",")); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNames); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, requiredSchema.getFields() @@ -158,7 +164,7 @@ public ClosableIterator getFileRecordIterator(StoragePath filePat if (modifiedDataSchema.equals(requiredSchema)) { return recordIterator; } - //The record reader puts the required columns in the positions of the data schema and nulls the rest of the columns + // record reader puts the required columns in the positions of the data schema and nulls the rest of the columns return new CloseableMappingIterator<>(recordIterator, projectRecord(modifiedDataSchema, requiredSchema)); } @@ -172,7 +178,7 @@ public HoodieRecordMerger getRecordMerger(String mergerStrategy) { if (mergerStrategy.equals(DEFAULT_MERGER_STRATEGY_UUID)) { return new HoodieHiveRecordMerger(); } - throw new HoodieException("The merger strategy UUID is not supported, Default: %s, Passed: " + mergerStrategy); + throw new HoodieException(String.format("The merger strategy UUID is not supported, Default: %s, Passed: %s", mergerStrategy, DEFAULT_MERGER_STRATEGY_UUID)); } @Override diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java similarity index 82% rename from hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java rename to hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java index 62f36864c99a9..d9c3764e2ea3d 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileGroupReaderBasedRecordReader.java @@ -20,8 +20,6 @@ package org.apache.hudi.hadoop; import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.config.HoodieCommonConfig; -import org.apache.hudi.common.config.HoodieReaderConfig; import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; @@ -33,15 +31,10 @@ import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; -import org.apache.hudi.common.util.TablePathUtils; import org.apache.hudi.common.util.collection.ExternalSpillableMap; -import org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader; import org.apache.hudi.hadoop.realtime.RealtimeSplit; import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; -import org.apache.hudi.storage.HoodieStorage; -import org.apache.hudi.storage.StoragePath; -import org.apache.hudi.storage.hadoop.HoodieHadoopStorage; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; @@ -56,14 +49,14 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -77,7 +70,7 @@ import static org.apache.hudi.common.config.HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH; import static org.apache.hudi.common.fs.FSUtils.getCommitTime; import static org.apache.hudi.common.fs.FSUtils.getFileId; -import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePath; +import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING; import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePathInfo; import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getDeltaCommitTimeFromLogPath; import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getFileIdFromLogPath; @@ -85,8 +78,15 @@ import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getRelativePartitionPath; import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getStorageConf; import static org.apache.hudi.hadoop.fs.HadoopFSUtils.isLogFile; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getPartitionFieldNames; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getTableBasePath; -public class HoodieFileGroupReaderRecordReader implements RecordReader { +/** + * {@link HoodieFileGroupReader} based implementation of Hive's {@link RecordReader} for {@link ArrayWritable}. + */ +public class HoodieFileGroupReaderBasedRecordReader implements RecordReader { + + private static final Logger LOG = LoggerFactory.getLogger(HoodieFileGroupReaderBasedRecordReader.class); public interface HiveReaderCreator { org.apache.hadoop.mapred.RecordReader getRecordReader( @@ -104,10 +104,10 @@ org.apache.hadoop.mapred.RecordReader getRecordRead private final JobConf jobConfCopy; private final UnaryOperator reverseProjection; - public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, - final InputSplit split, - final JobConf jobConf, - final Reporter reporter) throws IOException { + public HoodieFileGroupReaderBasedRecordReader(HiveReaderCreator readerCreator, + final InputSplit split, + final JobConf jobConf, + final Reporter reporter) throws IOException { this.jobConfCopy = new JobConf(jobConf); HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConfCopy); Set partitionColumns = new HashSet<>(getPartitionFieldNames(jobConfCopy)); @@ -125,20 +125,20 @@ public HoodieFileGroupReaderRecordReader(HiveReaderCreator readerCreator, Map hosts = new HashMap<>(); this.readerContext = new HiveHoodieReaderContext(readerCreator, split, jobConfCopy, reporter, tableSchema, hosts, metaClient); this.arrayWritable = new ArrayWritable(Writable.class, new Writable[requestedSchema.getFields().size()]); - //get some config vals + // get some config values long maxMemoryForMerge = jobConf.getLong(MAX_MEMORY_FOR_MERGE.key(), MAX_MEMORY_FOR_MERGE.defaultValue()); String spillableMapPath = jobConf.get(SPILLABLE_MAP_BASE_PATH.key(), FileIOUtils.getDefaultSpillableMapBasePath()); ExternalSpillableMap.DiskMapType spillMapType = ExternalSpillableMap.DiskMapType.valueOf(jobConf.get(SPILLABLE_DISK_MAP_TYPE.key(), SPILLABLE_DISK_MAP_TYPE.defaultValue().name()).toUpperCase(Locale.ROOT)); boolean bitmaskCompressEnabled = jobConf.getBoolean(DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()); - + LOG.debug("Creating HoodieFileGroupReaderRecordReader with tableBasePath={}, latestCommitTime={}, fileSplit={}", tableBasePath, latestCommitTime, fileSplit.getPath()); this.fileGroupReader = new HoodieFileGroupReader<>(readerContext, metaClient.getStorage(), tableBasePath, latestCommitTime, getFileSliceFromSplit(fileSplit, hosts, getFs(tableBasePath, jobConfCopy), tableBasePath), tableSchema, requestedSchema, metaClient.getTableConfig().getProps(), metaClient.getTableConfig(), fileSplit.getStart(), fileSplit.getLength(), false, maxMemoryForMerge, spillableMapPath, spillMapType, bitmaskCompressEnabled); this.fileGroupReader.initRecordIterators(); - //it expects the partition columns to be at the end + // it expects the partition columns to be at the end Schema outputSchema = HoodieAvroUtils.generateProjectionSchema(tableSchema, Stream.concat(tableSchema.getFields().stream().map(f -> f.name().toLowerCase(Locale.ROOT)).filter(n -> !partitionColumns.contains(n)), partitionColumns.stream()).collect(Collectors.toList())); @@ -188,11 +188,6 @@ public JobConf getJobConf() { return jobConfCopy; } - public static List getPartitionFieldNames(JobConf jobConf) { - String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); - return partitionFields.isEmpty() ? new ArrayList<>() : Arrays.stream(partitionFields.split("/")).collect(Collectors.toList()); - } - private static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, JobConf jobConf, String latestCommitTime) { TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); try { @@ -204,19 +199,6 @@ private static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, Job } } - public static String getTableBasePath(InputSplit split, JobConf jobConf) throws IOException { - if (split instanceof RealtimeSplit) { - RealtimeSplit realtimeSplit = (RealtimeSplit) split; - return realtimeSplit.getBasePath(); - } else { - Path inputPath = ((FileSplit) split).getPath(); - FileSystem fs = inputPath.getFileSystem(jobConf); - HoodieStorage storage = new HoodieHadoopStorage(fs); - Option tablePath = TablePathUtils.getTablePath(storage, convertToStoragePath(inputPath)); - return tablePath.get().toString(); - } - } - private static String getLatestCommitTime(InputSplit split, HoodieTableMetaClient metaClient) { if (split instanceof RealtimeSplit) { return ((RealtimeSplit) split).getMaxCommitTime(); @@ -225,7 +207,7 @@ private static String getLatestCommitTime(InputSplit split, HoodieTableMetaClien if (lastInstant.isPresent()) { return lastInstant.get().getTimestamp(); } else { - return ""; + return EMPTY_STRING; } } @@ -235,7 +217,7 @@ private static String getLatestCommitTime(InputSplit split, HoodieTableMetaClien private static FileSlice getFileSliceFromSplit(FileSplit split, Map hosts, FileSystem fs, String tableBasePath) throws IOException { BaseFile bootstrapBaseFile = createBootstrapBaseFile(split, hosts, fs); if (split instanceof RealtimeSplit) { - //mor + // MOR RealtimeSplit realtimeSplit = (RealtimeSplit) split; boolean isLogFile = isLogFile(realtimeSplit.getPath()); String fileID; @@ -255,7 +237,7 @@ private static FileSlice getFileSliceFromSplit(FileSplit split, Map partitionColumns; if (partitionColString == null) { @@ -291,18 +273,9 @@ private static Schema createRequestedSchema(Schema tableSchema, JobConf jobConf) } else { partitionColumns = Arrays.stream(partitionColString.split(",")).collect(Collectors.toSet()); } - //if they are actually written to the file, then it is ok to read them from the file + // if they are actually written to the file, then it is ok to read them from the file tableSchema.getFields().forEach(f -> partitionColumns.remove(f.name().toLowerCase(Locale.ROOT))); return HoodieAvroUtils.generateProjectionSchema(tableSchema, Arrays.stream(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(",")).filter(c -> !partitionColumns.contains(c)).collect(Collectors.toList())); } - - /** - * `schema.on.read` and skip merge not implemented - */ - public static boolean useFilegroupReader(final JobConf jobConf) { - return jobConf.getBoolean(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), HoodieReaderConfig.FILE_GROUP_READER_ENABLED.defaultValue()) - && !jobConf.getBoolean(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.defaultValue()) - && !jobConf.getBoolean(HoodieRealtimeRecordReader.REALTIME_SKIP_MERGE_PROP, false); - } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java index 7efcd5fea75a1..a2fb08fd6146d 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveRecord.java @@ -17,25 +17,6 @@ * under the License. */ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - package org.apache.hudi.hadoop; import org.apache.hudi.common.model.HoodieAvroIndexedRecord; @@ -65,10 +46,13 @@ import java.util.Map; import java.util.Properties; +/** + * {@link HoodieRecord} implementation for Hive records of {@link ArrayWritable}. + */ public class HoodieHiveRecord extends HoodieRecord { private boolean copy; - private boolean isDeleted; + private final boolean isDeleted; public boolean isDeleted() { return isDeleted; @@ -79,6 +63,7 @@ public boolean isDeleted() { private final ObjectInspectorCache objectInspectorCache; protected Schema schema; + public HoodieHiveRecord(HoodieKey key, ArrayWritable data, Schema schema, ObjectInspectorCache objectInspectorCache) { super(key, data); this.objectInspector = objectInspectorCache.getObjectInspector(schema); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 104f6beda9c6f..18b9e221978d1 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -54,6 +54,7 @@ import static org.apache.hudi.common.util.TablePathUtils.getTablePath; import static org.apache.hudi.common.util.TablePathUtils.isHoodieTablePath; import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePath; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.shouldUseFilegroupReader; /** * HoodieInputFormat which understands the Hoodie File Structure and filters files based on the Hoodie Mode. If paths @@ -114,13 +115,13 @@ private static boolean checkIfHudiTable(final InputSplit split, final JobConf jo public RecordReader getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException { HoodieRealtimeInputFormatUtils.addProjectionField(job, job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").split("/")); - if (HoodieFileGroupReaderRecordReader.useFilegroupReader(job)) { + if (shouldUseFilegroupReader(job)) { try { if (!(split instanceof FileSplit) || !checkIfHudiTable(split, job)) { return super.getRecordReader(split, job, reporter); } if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { - return new HoodieFileGroupReaderRecordReader((s, j, r) -> { + return new HoodieFileGroupReaderBasedRecordReader((s, j, r) -> { try { return new ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(), s, j, r); } catch (InterruptedException e) { @@ -128,7 +129,7 @@ public RecordReader getRecordReader(final InputSpli } }, split, job, reporter); } else { - return new HoodieFileGroupReaderRecordReader(super::getRecordReader, split, job, reporter); + return new HoodieFileGroupReaderBasedRecordReader(super::getRecordReader, split, job, reporter); } } catch (final IOException e) { throw new RuntimeException("Cannot create a RecordReaderWrapper", e); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java index 034d7c6b69c86..b89e69f4be8c6 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java @@ -19,7 +19,7 @@ package org.apache.hudi.hadoop.realtime; import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.hadoop.HoodieFileGroupReaderRecordReader; +import org.apache.hudi.hadoop.HoodieFileGroupReaderBasedRecordReader; import org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit; import org.apache.hadoop.fs.Path; @@ -37,6 +37,8 @@ import java.util.LinkedList; import java.util.List; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.shouldUseFilegroupReader; + /** * Allows to read multiple realtime file splits grouped together by CombineInputFormat. */ @@ -52,7 +54,7 @@ public class HoodieCombineRealtimeRecordReader implements RecordReader readers) { - useFileGroupReader = HoodieFileGroupReaderRecordReader.useFilegroupReader(jobConf); + useFileGroupReader = shouldUseFilegroupReader(jobConf); try { ValidationUtils.checkArgument(((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits().size() == readers .size(), "Num Splits does not match number of unique RecordReaders!"); @@ -60,7 +62,7 @@ public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split if (useFileGroupReader) { LOG.info("Creating new HoodieFileGroupReaderRecordReader for split"); RecordReader reader = readers.remove(0); - ValidationUtils.checkArgument(reader instanceof HoodieFileGroupReaderRecordReader, reader.toString() + "not instance of HoodieFileGroupReaderRecordReader "); + ValidationUtils.checkArgument(reader instanceof HoodieFileGroupReaderBasedRecordReader, reader.toString() + "not instance of HoodieFileGroupReaderRecordReader "); recordReaders.add(reader); } else { LOG.info("Creating new RealtimeRecordReader for split"); @@ -86,8 +88,8 @@ public boolean next(NullWritable key, ArrayWritable value) throws IOException { Path path; if (useFileGroupReader) { reader = currentRecordReader; - jobConf = ((HoodieFileGroupReaderRecordReader) reader).getJobConf(); - path = ((HoodieFileGroupReaderRecordReader) reader).getSplit().getPath(); + jobConf = ((HoodieFileGroupReaderBasedRecordReader) reader).getJobConf(); + path = ((HoodieFileGroupReaderBasedRecordReader) reader).getSplit().getPath(); } else { reader = ((HoodieRealtimeRecordReader)currentRecordReader).getReader(); jobConf = ((AbstractRealtimeRecordReader) reader).getJobConf(); @@ -104,7 +106,7 @@ public boolean next(NullWritable key, ArrayWritable value) throws IOException { @Override public NullWritable createKey() { if (useFileGroupReader) { - return ((HoodieFileGroupReaderRecordReader) this.currentRecordReader).createKey(); + return ((HoodieFileGroupReaderBasedRecordReader) this.currentRecordReader).createKey(); } else { return ((HoodieRealtimeRecordReader) this.currentRecordReader).createKey(); } @@ -113,7 +115,7 @@ public NullWritable createKey() { @Override public ArrayWritable createValue() { if (useFileGroupReader) { - return ((HoodieFileGroupReaderRecordReader) this.currentRecordReader).createValue(); + return ((HoodieFileGroupReaderBasedRecordReader) this.currentRecordReader).createValue(); } else { return ((HoodieRealtimeRecordReader) this.currentRecordReader).createValue(); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index c8be61423c90f..8d56e77dda922 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -23,7 +23,6 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.hadoop.HoodieFileGroupReaderRecordReader; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; import org.apache.hudi.hadoop.UseRecordReaderFromInputFormat; @@ -47,6 +46,7 @@ import static org.apache.hudi.hadoop.fs.HadoopFSUtils.getStorageConf; import static org.apache.hudi.hadoop.fs.HadoopFSUtils.isLogFile; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.shouldUseFilegroupReader; /** * Input Format, that provides a real-time view of data in a Hoodie table. @@ -73,7 +73,7 @@ public RecordReader getRecordReader(final InputSpli "HoodieRealtimeRecordReader can only work on RealtimeSplit and not with " + split); RealtimeSplit realtimeSplit = (RealtimeSplit) split; - if (HoodieFileGroupReaderRecordReader.useFilegroupReader(jobConf)) { + if (shouldUseFilegroupReader(jobConf)) { return super.getRecordReader(realtimeSplit, jobConf, reporter); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java index fe88855d4581d..64dc1f63af8d1 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java @@ -18,7 +18,9 @@ package org.apache.hudi.hadoop.utils; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.HoodieReaderConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -33,6 +35,7 @@ import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.TablePathUtils; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile; @@ -44,6 +47,8 @@ import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; import org.apache.hudi.hadoop.realtime.HoodieRealtimePath; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader; +import org.apache.hudi.hadoop.realtime.RealtimeSplit; import org.apache.hudi.storage.HoodieStorage; import org.apache.hudi.storage.HoodieStorageUtils; import org.apache.hudi.storage.StoragePath; @@ -52,8 +57,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcSerde; @@ -61,6 +68,8 @@ import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; import org.slf4j.Logger; @@ -540,4 +549,31 @@ public static HoodieRealtimeFileSplit createRealtimeFileSplit(HoodieRealtimePath throw new HoodieIOException(String.format("Failed to create instance of %s", HoodieRealtimeFileSplit.class.getName()), e); } } + + public static List getPartitionFieldNames(JobConf jobConf) { + String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); + return partitionFields.isEmpty() ? new ArrayList<>() : Arrays.stream(partitionFields.split("/")).collect(Collectors.toList()); + } + + public static String getTableBasePath(InputSplit split, JobConf jobConf) throws IOException { + if (split instanceof RealtimeSplit) { + RealtimeSplit realtimeSplit = (RealtimeSplit) split; + return realtimeSplit.getBasePath(); + } else { + Path inputPath = ((FileSplit) split).getPath(); + FileSystem fs = inputPath.getFileSystem(jobConf); + HoodieStorage storage = new HoodieHadoopStorage(fs); + Option tablePath = TablePathUtils.getTablePath(storage, convertToStoragePath(inputPath)); + return tablePath.get().toString(); + } + } + + /** + * `schema.on.read` and skip merge not implemented + */ + public static boolean shouldUseFilegroupReader(final JobConf jobConf) { + return jobConf.getBoolean(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), HoodieReaderConfig.FILE_GROUP_READER_ENABLED.defaultValue()) + && !jobConf.getBoolean(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.defaultValue()) + && !jobConf.getBoolean(HoodieRealtimeRecordReader.REALTIME_SKIP_MERGE_PROP, false); + } } From 33249cc712c6dcdde12efe8536579d3c9c5f8575 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Fri, 7 Jun 2024 15:44:46 -0400 Subject: [PATCH 40/41] fix failing test --- .../java/org/apache/hudi/client/TestPartitionTTLManagement.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestPartitionTTLManagement.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestPartitionTTLManagement.java index cda76154ca6a7..f4e9d206f06b1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestPartitionTTLManagement.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestPartitionTTLManagement.java @@ -182,7 +182,7 @@ private void writeRecordsForPartition(SparkRDDWriteClient client, HoodieTestData private List readRecords(String[] partitions) { return HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(storageConf, Arrays.stream(partitions).map(p -> Paths.get(basePath, p).toString()).collect(Collectors.toList()), - basePath, new JobConf(storageConf.unwrap()), true, false); + basePath, new JobConf(storageConf.unwrap()), true, true); } } From e95bcb80e4b729677ef65be41abc30e8c4ce5c03 Mon Sep 17 00:00:00 2001 From: Jonathan Vexler <=> Date: Sat, 8 Jun 2024 18:43:48 -0400 Subject: [PATCH 41/41] fix failing test --- .../utilities/streamer/TestHoodieStreamerUtils.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieStreamerUtils.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieStreamerUtils.java index e6c388b3e3b12..e00d729009412 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieStreamerUtils.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieStreamerUtils.java @@ -32,12 +32,14 @@ import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import org.mockito.ArgumentCaptor; import org.mockito.Mockito; import java.util.Collections; import java.util.List; +import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.doNothing; @@ -56,8 +58,15 @@ public static void setupOnce() throws Exception { initTestServices(); } + private static Stream validRecordTypes() { + Stream.Builder b = Stream.builder(); + b.add(Arguments.of(HoodieRecordType.SPARK)); + b.add(Arguments.of(HoodieRecordType.AVRO)); + return b.build(); + } + @ParameterizedTest - @EnumSource(HoodieRecordType.class) + @MethodSource("validRecordTypes") public void testCreateHoodieRecordsWithError(HoodieRecordType recordType) { Schema schema = new Schema.Parser().parse(SCHEMA_STRING); JavaRDD recordRdd = jsc.parallelize(Collections.singletonList(1)).map(i -> {