diff --git a/plugin/trino-hive/pom.xml b/plugin/trino-hive/pom.xml index de7b9a8f5c22..6e92c3e008e0 100644 --- a/plugin/trino-hive/pom.xml +++ b/plugin/trino-hive/pom.xml @@ -31,6 +31,12 @@ io.trino trino-parquet + + + org.apache.parquet + parquet-encoding + + @@ -225,6 +231,12 @@ alluxio-shaded-client + + org.apache.hudi + hudi-hadoop-mr + ${dep.hudi.version} + + org.apache.thrift libthrift diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java index 8ffc11d53a10..cd573064d5be 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java @@ -15,6 +15,8 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Stopwatch; +import com.google.common.base.Supplier; +import com.google.common.base.Suppliers; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterators; @@ -48,6 +50,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hive.common.ValidWriteIdList; import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat; @@ -60,6 +63,9 @@ import org.apache.hadoop.mapred.JobConfigurable; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapreduce.MRConfig; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.HoodieROTablePathFilter; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; import java.io.BufferedReader; import java.io.IOException; @@ -181,6 +187,7 @@ public class BackgroundHiveSplitLoader // * if you hold a read lock but not a write lock, you can do any of the above three operations, but you may // see a series of operations involving two or more of the operations carried out half way. private final ReadWriteLock taskExecutionLock = new ReentrantReadWriteLock(); + private final Supplier hoodiePathFilterSupplier; private HiveSplitSource hiveSplitSource; private Stopwatch stopwatch; @@ -226,6 +233,7 @@ public BackgroundHiveSplitLoader( this.partitions = new ConcurrentLazyQueue<>(partitions); this.hdfsContext = new HdfsContext(session); this.validWriteIds = requireNonNull(validWriteIds, "validWriteIds is null"); + this.hoodiePathFilterSupplier = Suppliers.memoize(HoodieROTablePathFilter::new); } @Override @@ -378,6 +386,7 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) InputFormat inputFormat = getInputFormat(configuration, schema, false); FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path); boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition()); + PathFilter pathFilter = isHudiParquetInputFormat(inputFormat) ? hoodiePathFilterSupplier.get() : path1 -> true; // S3 Select pushdown works at the granularity of individual S3 objects, // therefore we must not split files when it is enabled. 
@@ -408,7 +417,7 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) partition.getTableToPartitionMapping(), getOnlyElement(parents), targetPaths, - splittable); + splittable, pathFilter); if (manifestFileIterator.isPresent()) { fileIterators.addLast(manifestFileIterator.get()); return COMPLETED_FUTURE; @@ -469,7 +478,7 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) // To support custom input formats, we want to call getSplits() // on the input format to obtain file splits. - if (shouldUseFileSplitsFromInputFormat(inputFormat)) { + if (!isHudiParquetInputFormat(inputFormat) && shouldUseFileSplitsFromInputFormat(inputFormat)) { if (tableBucketInfo.isPresent()) { throw new TrinoException(NOT_SUPPORTED, "Trino cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName()); } @@ -562,6 +571,7 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) acidInfoBuilder.setOrcAcidVersionValidated(true); // no ACID; no further validation needed readPaths = ImmutableList.of(path); } + // Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping if (tableBucketInfo.isPresent()) { ListenableFuture lastResult = immediateVoidFuture(); // TODO document in addToQueue() that it is sufficient to hold on to last returned future @@ -569,7 +579,7 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) // list all files in the partition List files = new ArrayList<>(); try { - Iterators.addAll(files, new HiveFileIterator(table, readPath, fs, directoryLister, namenodeStats, FAIL, ignoreAbsentPartitions)); + Iterators.addAll(files, new HiveFileIterator(table, readPath, fs, directoryLister, namenodeStats, FAIL, ignoreAbsentPartitions, pathFilter)); } catch (HiveFileIterator.NestedDirectoryNotAllowedException e) { // Fail here to be on the safe side. This seems to be the same as what Hive does @@ -597,7 +607,7 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) for (Path readPath : readPaths) { Optional acidInfo = isFullAcid ? 
acidInfoBuilder.build() : Optional.empty(); - fileIterators.addLast(createInternalHiveSplitIterator(readPath, fs, splitFactory, splittable, acidInfo)); + fileIterators.addLast(createInternalHiveSplitIterator(readPath, fs, splitFactory, splittable, acidInfo, pathFilter)); } if (!fileStatusOriginalFiles.isEmpty()) { @@ -674,13 +684,15 @@ Optional> buildManifestFileIterator( TableToPartitionMapping tableToPartitionMapping, Path parent, List paths, - boolean splittable) + boolean splittable, + PathFilter pathFilter) throws IOException { FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, parent); Map fileStatuses = new HashMap<>(); - HiveFileIterator fileStatusIterator = new HiveFileIterator(table, parent, targetFilesystem, directoryLister, namenodeStats, IGNORED, false); + pathFilter = path1 -> true; + HiveFileIterator fileStatusIterator = new HiveFileIterator(table, parent, targetFilesystem, directoryLister, namenodeStats, IGNORED, false, pathFilter); fileStatusIterator.forEachRemaining(status -> fileStatuses.put(getPathWithoutSchemeAndAuthority(status.getPath()), status)); List locatedFileStatuses = new ArrayList<>(); @@ -757,6 +769,14 @@ private ListenableFuture addSplitsToSource(InputSplit[] targetSplits, Inte return lastResult; } + private static boolean isHudiParquetInputFormat(InputFormat inputFormat) + { + if (inputFormat instanceof HoodieParquetRealtimeInputFormat) { + return false; + } + return inputFormat instanceof HoodieParquetInputFormat; + } + private static boolean shouldUseFileSplitsFromInputFormat(InputFormat inputFormat) { return Arrays.stream(inputFormat.getClass().getAnnotations()) @@ -765,9 +785,9 @@ private static boolean shouldUseFileSplitsFromInputFormat(InputFormat inpu .anyMatch(name -> name.equals("UseFileSplitsFromInputFormat")); } - private Iterator createInternalHiveSplitIterator(Path path, FileSystem fileSystem, InternalHiveSplitFactory splitFactory, boolean splittable, Optional acidInfo) + private Iterator createInternalHiveSplitIterator(Path path, FileSystem fileSystem, InternalHiveSplitFactory splitFactory, boolean splittable, Optional acidInfo, PathFilter pathFilter) { - return Streams.stream(new HiveFileIterator(table, path, fileSystem, directoryLister, namenodeStats, recursiveDirWalkerEnabled ? RECURSE : IGNORED, ignoreAbsentPartitions)) + return Streams.stream(new HiveFileIterator(table, path, fileSystem, directoryLister, namenodeStats, recursiveDirWalkerEnabled ? 
RECURSE : IGNORED, ignoreAbsentPartitions, pathFilter)) .map(status -> splitFactory.createInternalHiveSplit(status, OptionalInt.empty(), splittable, acidInfo)) .filter(Optional::isPresent) .map(Optional::get) diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursorProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursorProvider.java index 65fc0c2795a1..9345503d2fb8 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursorProvider.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursorProvider.java @@ -30,6 +30,7 @@ import java.io.IOException; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Properties; @@ -71,7 +72,8 @@ public Optional createRecordCursor( List columns, TupleDomain effectivePredicate, TypeManager typeManager, - boolean s3SelectPushdownEnabled) + boolean s3SelectPushdownEnabled, + final Map customSplitInfo) { configuration.setInt(LineRecordReader.MAX_LINE_LENGTH, textMaxLineLengthBytes); @@ -98,7 +100,8 @@ public Optional createRecordCursor( start, length, schema, - readerColumns); + readerColumns, + customSplitInfo); try { return new GenericHiveRecordCursor<>( diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSourceProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSourceProvider.java index 8666dfdb67e4..b4e1a2a2a5bf 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSourceProvider.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSourceProvider.java @@ -195,7 +195,8 @@ public ConnectorPageSource createPageSource( hiveSplit.isS3SelectPushdownEnabled(), hiveSplit.getAcidInfo(), originalFile, - hiveTable.getTransaction()); + hiveTable.getTransaction(), + hiveSplit.getCustomSplitInfo()); if (pageSource.isPresent()) { ConnectorPageSource source = pageSource.get(); @@ -259,7 +260,8 @@ public static Optional createHivePageSource( boolean s3SelectPushdownEnabled, Optional acidInfo, boolean originalFile, - AcidTransaction transaction) + AcidTransaction transaction, + Map customSplitInfo) { if (effectivePredicate.isNone()) { return Optional.of(new EmptyPageSource()); @@ -333,7 +335,8 @@ public static Optional createHivePageSource( desiredColumns, effectivePredicate, typeManager, - s3SelectPushdownEnabled); + s3SelectPushdownEnabled, + customSplitInfo); if (readerWithProjections.isPresent()) { RecordCursor delegate = readerWithProjections.get().getRecordCursor(); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursorProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursorProvider.java index ca924cb27e48..b13f9dae9987 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursorProvider.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursorProvider.java @@ -21,6 +21,7 @@ import org.apache.hadoop.fs.Path; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Properties; @@ -39,7 +40,8 @@ Optional createRecordCursor( List columns, TupleDomain effectivePredicate, TypeManager typeManager, - boolean s3SelectPushdownEnabled); + boolean s3SelectPushdownEnabled, + Map customSplitInfo); /** * A wrapper class for diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplit.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplit.java index 5bf3ae0d17ea..bc22c21625b5 
100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplit.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplit.java @@ -22,6 +22,7 @@ import io.trino.spi.connector.ConnectorSplit; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.OptionalInt; @@ -55,6 +56,7 @@ public class HiveSplit private final boolean s3SelectPushdownEnabled; private final Optional acidInfo; private final long splitNumber; + private final Map customSplitInfo; @JsonCreator public HiveSplit( @@ -77,7 +79,8 @@ public HiveSplit( @JsonProperty("bucketValidation") Optional bucketValidation, @JsonProperty("s3SelectPushdownEnabled") boolean s3SelectPushdownEnabled, @JsonProperty("acidInfo") Optional acidInfo, - @JsonProperty("splitNumber") long splitNumber) + @JsonProperty("splitNumber") long splitNumber, + @JsonProperty("customSplitInfo") Map customSplitInfo) { checkArgument(start >= 0, "start must be positive"); checkArgument(length >= 0, "length must be positive"); @@ -115,6 +118,7 @@ public HiveSplit( this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; this.acidInfo = acidInfo; this.splitNumber = splitNumber; + this.customSplitInfo = ImmutableMap.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null")); } @JsonProperty @@ -244,6 +248,11 @@ public long getSplitNumber() return splitNumber; } + public Map getCustomSplitInfo() + { + return customSplitInfo; + } + @Override public Object getInfo() { diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitSource.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitSource.java index 68c5758adcc8..683c2184178a 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitSource.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitSource.java @@ -384,7 +384,8 @@ else if (maxSplitBytes * 2 >= remainingBlockBytes) { internalSplit.getBucketValidation(), internalSplit.isS3SelectPushdownEnabled(), internalSplit.getAcidInfo(), - numberOfProcessedSplits.getAndIncrement())); + numberOfProcessedSplits.getAndIncrement(), + internalSplit.getCustomSplitInfo())); internalSplit.increaseStart(splitBytes); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/InternalHiveSplit.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/InternalHiveSplit.java index af4aa4509630..7607e5519db4 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/InternalHiveSplit.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/InternalHiveSplit.java @@ -14,6 +14,7 @@ package io.trino.plugin.hive; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import io.trino.plugin.hive.HiveSplit.BucketConversion; import io.trino.plugin.hive.HiveSplit.BucketValidation; import io.trino.spi.HostAddress; @@ -22,6 +23,7 @@ import javax.annotation.concurrent.NotThreadSafe; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.OptionalInt; import java.util.Properties; @@ -60,6 +62,7 @@ public class InternalHiveSplit private final Optional bucketValidation; private final boolean s3SelectPushdownEnabled; private final Optional acidInfo; + private final Map customSplitInfo; private long start; private int currentBlockIndex; @@ -82,7 +85,8 @@ public InternalHiveSplit( Optional bucketConversion, Optional bucketValidation, boolean s3SelectPushdownEnabled, - Optional acidInfo) + Optional acidInfo, + Map customSplitInfo) { checkArgument(start 
>= 0, "start must be positive"); checkArgument(end >= 0, "length must be positive"); @@ -116,6 +120,7 @@ public InternalHiveSplit( this.bucketValidation = bucketValidation; this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; this.acidInfo = acidInfo; + this.customSplitInfo = ImmutableMap.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null")); } public String getPath() @@ -198,6 +203,11 @@ public Optional getBucketValidation() return bucketValidation; } + public Map getCustomSplitInfo() + { + return customSplitInfo; + } + public InternalHiveBlock currentBlock() { checkState(!isDone(), "All blocks have been consumed"); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java index cf312dd95cc9..b164ef0e1356 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java @@ -87,6 +87,7 @@ import static io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames; import static io.trino.plugin.hive.parquet.ParquetColumnIOConverter.constructField; import static io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName; +import static io.trino.plugin.hive.util.HiveUtil.shouldUseRecordReaderFromInputFormat; import static io.trino.spi.type.BigintType.BIGINT; import static java.lang.String.format; import static java.util.Objects.requireNonNull; @@ -147,7 +148,7 @@ public Optional createPageSource( boolean originalFile, AcidTransaction transaction) { - if (!PARQUET_SERDE_CLASS_NAMES.contains(getDeserializerClassName(schema))) { + if (!PARQUET_SERDE_CLASS_NAMES.contains(getDeserializerClassName(schema)) || shouldUseRecordReaderFromInputFormat(configuration, schema)) { return Optional.empty(); } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java index 09bd05549373..fa5dbccf5836 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Properties; import java.util.Set; @@ -67,7 +68,8 @@ public Optional createRecordCursor( List columns, TupleDomain effectivePredicate, TypeManager typeManager, - boolean s3SelectPushdownEnabled) + boolean s3SelectPushdownEnabled, + Map customSplitInfo) { if (!s3SelectPushdownEnabled) { return Optional.empty(); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CustomSplitConversionUtils.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CustomSplitConversionUtils.java new file mode 100644 index 000000000000..cdcb43089421 --- /dev/null +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CustomSplitConversionUtils.java @@ -0,0 +1,66 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.spi.TrinoException; +import org.apache.hadoop.mapred.FileSplit; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT; + +/** + * Utility class for extracting the customSplitInfo map from a custom FileSplit and for transforming that map back into a FileSplit. + */ +public class CustomSplitConversionUtils +{ + private static final List<CustomSplitConverter> converters = ImmutableList.of(new HudiRealtimeSplitConverter()); + + private CustomSplitConversionUtils() + { + } + + public static Map<String, String> extractCustomSplitInfo(FileSplit split) + { + for (CustomSplitConverter converter : converters) { + Optional<Map<String, String>> customSplitData = converter.extractCustomSplitInfo(split); + if (customSplitData.isPresent()) { + return customSplitData.get(); + } + } + return ImmutableMap.of(); + } + + public static FileSplit recreateSplitWithCustomInfo(FileSplit split, Map<String, String> customSplitInfo) + { + for (CustomSplitConverter converter : converters) { + Optional<FileSplit> fileSplit; + try { + fileSplit = converter.recreateFileSplitWithCustomInfo(split, customSplitInfo); + } + catch (IOException e) { + throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, String.format("Split converter %s failed to create FileSplit.", converter.getClass()), e); + } + if (fileSplit.isPresent()) { + return fileSplit.get(); + } + } + return split; + } +} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CustomSplitConverter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CustomSplitConverter.java new file mode 100644 index 000000000000..dba4f3df926f --- /dev/null +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CustomSplitConverter.java @@ -0,0 +1,39 @@ + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hive.util; + +import org.apache.hadoop.mapred.FileSplit; + +import java.io.IOException; +import java.util.Map; +import java.util.Optional; + +/** + * Interface for split-specific conversion from a custom FileSplit to a customSplitInfo map and back. + */ +public interface CustomSplitConverter +{ + /** + * This method is expected to return Optional.empty() if the FileSplit does not match the split converter. + */ + Optional<Map<String, String>> extractCustomSplitInfo(FileSplit split); + + /** + * This method is expected to merge the customSplitInfo with the split to recreate the custom FileSplit.
+ * It is expected to return Optional.empty() if the customSplitInfo does not match the split converter. + */ + Optional<FileSplit> recreateFileSplitWithCustomInfo(FileSplit split, Map<String, String> customSplitInfo) throws IOException; +} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveFileIterator.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveFileIterator.java index de577035be09..58af237644d8 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveFileIterator.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveFileIterator.java @@ -14,6 +14,7 @@ package io.trino.plugin.hive.util; import com.google.common.collect.AbstractIterator; +import com.google.common.collect.Iterators; import io.airlift.stats.TimeStat; import io.trino.plugin.hive.DirectoryLister; import io.trino.plugin.hive.NamenodeStats; @@ -22,6 +23,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.RemoteIterator; import java.io.FileNotFoundException; @@ -52,6 +54,7 @@ public enum NestedDirectoryPolicy private final NamenodeStats namenodeStats; private final NestedDirectoryPolicy nestedDirectoryPolicy; private final boolean ignoreAbsentPartitions; + private final PathFilter pathFilter; private Iterator<LocatedFileStatus> remoteIterator = emptyIterator(); @@ -62,7 +65,8 @@ public HiveFileIterator( DirectoryLister directoryLister, NamenodeStats namenodeStats, NestedDirectoryPolicy nestedDirectoryPolicy, - boolean ignoreAbsentPartitions) + boolean ignoreAbsentPartitions, + PathFilter pathFilter) { paths.addLast(requireNonNull(path, "path is null")); this.table = requireNonNull(table, "table is null"); @@ -71,6 +75,7 @@ public HiveFileIterator( this.namenodeStats = requireNonNull(namenodeStats, "namenodeStats is null"); this.nestedDirectoryPolicy = requireNonNull(nestedDirectoryPolicy, "nestedDirectoryPolicy is null"); this.ignoreAbsentPartitions = ignoreAbsentPartitions; + this.pathFilter = requireNonNull(pathFilter, "pathFilter is null"); } @Override @@ -104,17 +109,17 @@ protected LocatedFileStatus computeNext() if (paths.isEmpty()) { return endOfData(); } - remoteIterator = getLocatedFileStatusRemoteIterator(paths.removeFirst()); + remoteIterator = getLocatedFileStatusRemoteIterator(paths.removeFirst(), pathFilter); } } - private Iterator<LocatedFileStatus> getLocatedFileStatusRemoteIterator(Path path) + private Iterator<LocatedFileStatus> getLocatedFileStatusRemoteIterator(Path path, PathFilter pathFilter) { try (TimeStat.BlockTimer ignored = namenodeStats.getListLocatedStatus().time()) { if (ignoreAbsentPartitions && !exists(path)) { return emptyIterator(); } - return new FileStatusIterator(table, path, fileSystem, directoryLister, namenodeStats); + return Iterators.filter(new FileStatusIterator(table, path, fileSystem, directoryLister, namenodeStats), input -> pathFilter.accept(input.getPath())); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java index f063d3d49075..b5420daef38f 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java @@ -14,6 +14,7 @@ package io.trino.plugin.hive.util; import com.google.common.base.Joiner; +import com.google.common.base.Predicate; import com.google.common.base.Splitter; import com.google.common.base.VerifyException; import
com.google.common.collect.ImmutableList; @@ -75,6 +76,7 @@ import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.util.ReflectionUtils; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.joda.time.format.DateTimeFormatterBuilder; @@ -85,12 +87,15 @@ import javax.annotation.Nullable; import java.io.IOException; +import java.lang.annotation.Annotation; import java.lang.reflect.Field; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.math.BigDecimal; import java.math.BigInteger; +import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.OptionalInt; import java.util.Properties; @@ -102,6 +107,7 @@ import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.Lists.newArrayList; +import static com.google.common.collect.Lists.transform; import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; import static io.trino.plugin.hive.HiveColumnHandle.bucketColumnHandle; @@ -131,6 +137,7 @@ import static io.trino.plugin.hive.metastore.SortingColumn.Order.DESCENDING; import static io.trino.plugin.hive.util.ConfigurationUtils.copy; import static io.trino.plugin.hive.util.ConfigurationUtils.toJobConf; +import static io.trino.plugin.hive.util.CustomSplitConversionUtils.recreateSplitWithCustomInfo; import static io.trino.plugin.hive.util.HiveBucketing.bucketedOnTimestamp; import static io.trino.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; @@ -165,6 +172,7 @@ import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_ALL_COLUMNS; import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR; +import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR; import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; public final class HiveUtil @@ -211,7 +219,7 @@ private HiveUtil() { } - public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns) + public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, Map<String, String> customSplitInfo) { // determine which hive columns we will read List<HiveColumnHandle> readColumns = columns.stream() @@ -228,15 +236,27 @@ private HiveUtil() // Tell hive the columns we would like to read, this lets hive optimize reading column oriented files configuration = copy(configuration); setReadColumns(configuration, readHiveColumnIndexes); + // Only propagate serialization schema configs by default + Predicate<String> schemaFilter = schemaProperty -> schemaProperty.startsWith("serialization."); InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true); JobConf jobConf = toJobConf(configuration); FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null); - // propagate serialization configuration to getRecordReader + if (!customSplitInfo.isEmpty() && isHudiRealtimeSplit(customSplitInfo)) { + fileSplit =
recreateSplitWithCustomInfo(fileSplit, customSplitInfo); + + // Add additional column information for record reader + List<String> readHiveColumnNames = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getName)); + jobConf.set(READ_COLUMN_NAMES_CONF_STR, Joiner.on(',').join(readHiveColumnNames)); + + // Remove filter when using customSplitInfo as the record reader requires complete schema configs + schemaFilter = schemaProperty -> true; + } + schema.stringPropertyNames().stream() - .filter(name -> name.startsWith("serialization.")) - .forEach(name -> jobConf.set(name, schema.getProperty(name))); + .filter(schemaFilter) + .forEach(name -> jobConf.set(name, schema.getProperty(name))); configureCompressionCodecs(jobConf); @@ -271,12 +291,27 @@ private HiveUtil() } } + private static boolean isHudiRealtimeSplit(Map<String, String> customSplitInfo) + { + String customSplitClass = customSplitInfo.get(HudiRealtimeSplitConverter.CUSTOM_SPLIT_CLASS_KEY); + return HoodieRealtimeFileSplit.class.getName().equals(customSplitClass); + } + public static void setReadColumns(Configuration configuration, List<Integer> readHiveColumnIndexes) { configuration.set(READ_COLUMN_IDS_CONF_STR, Joiner.on(',').join(readHiveColumnIndexes)); configuration.setBoolean(READ_ALL_COLUMNS, false); } + public static boolean shouldUseRecordReaderFromInputFormat(Configuration configuration, Properties schema) + { + InputFormat<?, ?> inputFormat = HiveUtil.getInputFormat(configuration, schema, false); + return Arrays.stream(inputFormat.getClass().getAnnotations()) + .map(Annotation::annotationType) + .map(Class::getSimpleName) + .anyMatch(name -> name.equals("UseRecordReaderFromInputFormat")); + } + private static void configureCompressionCodecs(JobConf jobConf) { // add Airlift LZO and LZOP to head of codecs list so as to not override existing entries diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HudiRealtimeSplitConverter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HudiRealtimeSplitConverter.java new file mode 100644 index 000000000000..251c45811827 --- /dev/null +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HudiRealtimeSplitConverter.java @@ -0,0 +1,72 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.hive.util; + +import com.google.common.collect.ImmutableMap; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +/** + * HoodieRealtimeFileSplit specific implementation of CustomSplitConverter. + * Extracts customSplitInfo from a HoodieRealtimeFileSplit and reconstructs a HoodieRealtimeFileSplit from that map.
+ */ +public class HudiRealtimeSplitConverter + implements CustomSplitConverter +{ + public static final String CUSTOM_SPLIT_CLASS_KEY = "custom_split_class"; + private static final String HUDI_DELTA_FILEPATHS_KEY = "hudi_delta_filepaths"; + private static final String HUDI_BASEPATH_KEY = "hudi_basepath"; + private static final String HUDI_MAX_COMMIT_TIME_KEY = "hudi_max_commit_time"; + + @Override + public Optional<Map<String, String>> extractCustomSplitInfo(FileSplit split) + { + if (split instanceof HoodieRealtimeFileSplit) { + HoodieRealtimeFileSplit hudiSplit = (HoodieRealtimeFileSplit) split; + Map<String, String> customSplitInfo = ImmutableMap.<String, String>builder() + .put(CUSTOM_SPLIT_CLASS_KEY, HoodieRealtimeFileSplit.class.getName()) + .put(HUDI_DELTA_FILEPATHS_KEY, String.join(",", hudiSplit.getDeltaLogPaths())) + .put(HUDI_BASEPATH_KEY, hudiSplit.getBasePath()) + .put(HUDI_MAX_COMMIT_TIME_KEY, hudiSplit.getMaxCommitTime()) + .build(); + return Optional.of(customSplitInfo); + } + return Optional.empty(); + } + + @Override + public Optional<FileSplit> recreateFileSplitWithCustomInfo(FileSplit split, Map<String, String> customSplitInfo) throws IOException + { + String customSplitClass = customSplitInfo.get(CUSTOM_SPLIT_CLASS_KEY); + if (HoodieRealtimeFileSplit.class.getName().equals(customSplitClass)) { + requireNonNull(customSplitInfo.get(HUDI_DELTA_FILEPATHS_KEY), "HUDI_DELTA_FILEPATHS_KEY is missing"); + List<String> deltaLogPaths = Arrays.asList(customSplitInfo.get(HUDI_DELTA_FILEPATHS_KEY).split(",")); + return Optional.of(new HoodieRealtimeFileSplit( + split, + requireNonNull(customSplitInfo.get(HUDI_BASEPATH_KEY), "HUDI_BASEPATH_KEY is missing"), + deltaLogPaths, + requireNonNull(customSplitInfo.get(HUDI_MAX_COMMIT_TIME_KEY), "HUDI_MAX_COMMIT_TIME_KEY is missing"))); + } + return Optional.empty(); + } +} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/InternalHiveSplitFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/InternalHiveSplitFactory.java index 94389202f81d..4b5fd1dc245c 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/InternalHiveSplitFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/InternalHiveSplitFactory.java @@ -14,6 +14,7 @@ package io.trino.plugin.hive.util; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import io.airlift.units.DataSize; import io.trino.plugin.hive.AcidInfo; import io.trino.plugin.hive.HiveColumnHandle; @@ -53,6 +54,7 @@ import static com.google.common.collect.ImmutableList.toImmutableList; import static io.airlift.slice.Slices.utf8Slice; import static io.trino.plugin.hive.HiveColumnHandle.isPathColumnHandle; +import static io.trino.plugin.hive.util.CustomSplitConversionUtils.extractCustomSplitInfo; import static io.trino.plugin.hive.util.HiveUtil.isSplittable; import static java.util.Objects.requireNonNull; @@ -126,13 +128,15 @@ public Optional<InternalHiveSplit> createInternalHiveSplit(LocatedFileStatus sta status.getModificationTime(), bucketNumber, splittable, - acidInfo); + acidInfo, + ImmutableMap.of()); } public Optional<InternalHiveSplit> createInternalHiveSplit(FileSplit split) throws IOException { FileStatus file = fileSystem.getFileStatus(split.getPath()); + Map<String, String> customSplitInfo = extractCustomSplitInfo(split); return createInternalHiveSplit( split.getPath(), fileSystem.getFileBlockLocations(file, split.getStart(), split.getLength()), @@ -142,7 +146,8 @@ public Optional<InternalHiveSplit> createInternalHiveSplit(FileSplit split) file.getModificationTime(), OptionalInt.empty(), false, - Optional.empty(), +
customSplitInfo); } private Optional createInternalHiveSplit( @@ -154,7 +159,8 @@ private Optional createInternalHiveSplit( long fileModificationTime, OptionalInt bucketNumber, boolean splittable, - Optional acidInfo) + Optional acidInfo, + Map customSplitInfo) { String pathString = path.toString(); if (!pathMatchesPredicate(pathDomain, pathString)) { @@ -226,7 +232,8 @@ private Optional createInternalHiveSplit( bucketConversion, bucketValidation, s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path), - acidInfo)); + acidInfo, + customSplitInfo)); } private static void checkBlocks(Path path, List blocks, long start, long length) diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestBackgroundHiveSplitLoader.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestBackgroundHiveSplitLoader.java index 554a5a342c9a..595d552c8141 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestBackgroundHiveSplitLoader.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestBackgroundHiveSplitLoader.java @@ -874,7 +874,8 @@ public void testBuildManifestFileIterator() TableToPartitionMapping.empty(), new Path("hdfs://VOL1:9000/db_name/table_name"), paths, - true); + true, + null); assertTrue(splitIterator.isPresent()); List splits = ImmutableList.copyOf(splitIterator.get()); assertEquals(splits.size(), 2); @@ -912,7 +913,8 @@ public void testBuildManifestFileIteratorNestedDirectory() TableToPartitionMapping.empty(), new Path("hdfs://VOL1:9000/db_name/table_name"), paths, - false); + false, + null); assertTrue(splitIterator.isEmpty()); } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFileFormats.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFileFormats.java index e134c0e10948..9795a72d2a75 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFileFormats.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFileFormats.java @@ -14,6 +14,7 @@ package io.trino.plugin.hive; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; import io.airlift.compress.lzo.LzoCodec; @@ -945,7 +946,8 @@ private ConnectorPageSource createPageSourceFromCursorProvider( false, Optional.empty(), false, - NO_ACID_TRANSACTION); + NO_ACID_TRANSACTION, + ImmutableMap.of()); return pageSource.get(); } @@ -1015,7 +1017,8 @@ private void testPageSourceFactory( false, Optional.empty(), false, - NO_ACID_TRANSACTION); + NO_ACID_TRANSACTION, + ImmutableMap.of()); assertTrue(pageSource.isPresent()); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java index 619e7f33f804..313d4407b29d 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java @@ -242,7 +242,8 @@ private static ConnectorPageSource createPageSource(HiveTransactionHandle transa Optional.empty(), false, Optional.empty(), - 0); + 0, + ImmutableMap.of()); ConnectorTableHandle table = new HiveTableHandle(SCHEMA_NAME, TABLE_NAME, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty()); HivePageSourceProvider provider = new HivePageSourceProvider( TYPE_MANAGER, diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplit.java 
b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplit.java index e6712b48d0c3..2e9ce8da4900 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplit.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplit.java @@ -27,6 +27,7 @@ import org.testng.annotations.Test; import java.time.Instant; +import java.util.Map; import java.util.Optional; import java.util.OptionalInt; import java.util.Properties; @@ -57,6 +58,7 @@ public void testJsonRoundTrip() acidInfoBuilder.addDeleteDelta(new Path("file:///data/fullacid/delete_delta_0000004_0000004_0000")); acidInfoBuilder.addDeleteDelta(new Path("file:///data/fullacid/delete_delta_0000007_0000007_0000")); AcidInfo acidInfo = acidInfoBuilder.build().get(); + Map customSplitInfo = ImmutableMap.of("key", "value"); HiveSplit expected = new HiveSplit( "db", @@ -82,7 +84,8 @@ public void testJsonRoundTrip() Optional.empty(), false, Optional.of(acidInfo), - 555534); + 555534, + customSplitInfo); String json = codec.toJson(expected); HiveSplit actual = codec.fromJson(json); @@ -104,5 +107,6 @@ public void testJsonRoundTrip() assertEquals(actual.isS3SelectPushdownEnabled(), expected.isS3SelectPushdownEnabled()); assertEquals(actual.getAcidInfo().get(), expected.getAcidInfo().get()); assertEquals(actual.getSplitNumber(), expected.getSplitNumber()); + assertEquals(actual.getCustomSplitInfo(), expected.getCustomSplitInfo()); } } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplitSource.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplitSource.java index 381d8f62a270..c9cc38146086 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplitSource.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplitSource.java @@ -14,6 +14,7 @@ package io.trino.plugin.hive; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.common.util.concurrent.SettableFuture; import io.airlift.stats.CounterStat; import io.airlift.units.DataSize; @@ -354,7 +355,8 @@ private TestSplit(int id, OptionalInt bucketNumber, DataSize fileSize) Optional.empty(), Optional.empty(), false, - Optional.empty()); + Optional.empty(), + ImmutableMap.of()); } private static Properties properties(String key, String value) diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestNodeLocalDynamicSplitPruning.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestNodeLocalDynamicSplitPruning.java index 03d953096e1d..e8c36b4cb63a 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestNodeLocalDynamicSplitPruning.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestNodeLocalDynamicSplitPruning.java @@ -107,7 +107,8 @@ private static ConnectorPageSource createTestingPageSource(HiveTransactionHandle Optional.empty(), false, Optional.empty(), - 0); + 0, + ImmutableMap.of()); TableHandle tableHandle = new TableHandle( new CatalogName(HIVE_CATALOG_NAME), diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestOrcPageSourceMemoryTracking.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestOrcPageSourceMemoryTracking.java index aadc450cb5cc..4a5c7a00f970 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestOrcPageSourceMemoryTracking.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestOrcPageSourceMemoryTracking.java @@ -14,6 +14,7 @@ package io.trino.plugin.hive; import 
com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import io.airlift.slice.Slice; import io.airlift.stats.Distribution; @@ -574,7 +575,8 @@ public ConnectorPageSource newPageSource(FileFormatDataSourceStats stats, Connec false, Optional.empty(), false, - NO_ACID_TRANSACTION) + NO_ACID_TRANSACTION, + ImmutableMap.of()) .orElseThrow(); } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/AbstractFileFormat.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/AbstractFileFormat.java index cf48e225641e..a7c5abfd8539 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/AbstractFileFormat.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/AbstractFileFormat.java @@ -153,8 +153,8 @@ public ConnectorPageSource createGenericReader( Optional.empty(), false, Optional.empty(), - 0); - + 0, + ImmutableMap.of()); return factory.createPageSource( TestingConnectorTransactionHandle.INSTANCE, session, split, @@ -192,7 +192,8 @@ static ConnectorPageSource createPageSource( readColumns, TupleDomain.all(), TYPE_MANAGER, - false); + false, + ImmutableMap.of()); checkState(recordCursorWithProjections.isPresent(), "readerPageSourceWithProjections is not present"); checkState(recordCursorWithProjections.get().getProjectedReaderColumns().isEmpty(), "projection should not be required"); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcPredicates.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcPredicates.java index 549066533c6e..b59a6cab46ac 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcPredicates.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcPredicates.java @@ -227,7 +227,8 @@ private ConnectorPageSource createPageSource( false, Optional.empty(), false, - NO_ACID_TRANSACTION); + NO_ACID_TRANSACTION, + ImmutableMap.of()); assertTrue(pageSource.isPresent()); return pageSource.get(); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestTimestampMicros.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestTimestampMicros.java index 05c259bf8fd7..1ba2ef264afe 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestTimestampMicros.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestTimestampMicros.java @@ -49,6 +49,7 @@ import static io.trino.spi.type.TimestampType.createTimestampType; import static io.trino.spi.type.TimestampWithTimeZoneType.createTimestampWithTimeZoneType; import static io.trino.testing.MaterializedResult.materializeSourceDataStream; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; import static org.assertj.core.api.Assertions.assertThat; @@ -106,6 +107,7 @@ private ConnectorPageSource createPageSource(ConnectorSession session, File parq HivePageSourceFactory pageSourceFactory = StandardFileFormats.TRINO_PARQUET.getHivePageSourceFactory(HDFS_ENVIRONMENT).orElseThrow(); Properties schema = new Properties(); + schema.setProperty(FILE_INPUT_FORMAT, HiveStorageFormat.PARQUET.getInputFormat()); schema.setProperty(SERIALIZATION_LIB, HiveStorageFormat.PARQUET.getSerDe()); ReaderPageSource pageSourceWithProjections = pageSourceFactory.createPageSource( diff --git 
a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveUtil.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveUtil.java index bdbe9e987bf0..5435fdbe6f70 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveUtil.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveUtil.java @@ -38,6 +38,7 @@ import static io.trino.plugin.hive.util.HiveUtil.getDeserializer; import static io.trino.plugin.hive.util.HiveUtil.getInputFormat; import static io.trino.plugin.hive.util.HiveUtil.parseHiveTimestamp; +import static io.trino.plugin.hive.util.HiveUtil.shouldUseRecordReaderFromInputFormat; import static io.trino.plugin.hive.util.HiveUtil.toPartitionValues; import static io.trino.type.DateTimes.MICROSECONDS_PER_MILLISECOND; import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; @@ -45,6 +46,8 @@ import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT; import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; public class TestHiveUtil { @@ -110,6 +113,18 @@ public void testGetInputFormat() assertInstanceOf(getInputFormat(configuration, legacyParquetSchema, true), MapredParquetInputFormat.class); } + @Test + public void testShouldUseRecordReaderFromInputFormat() + { + Properties parquetSchema = new Properties(); + parquetSchema.setProperty(FILE_INPUT_FORMAT, "parquet.hive.MapredParquetInputFormat"); + assertFalse(shouldUseRecordReaderFromInputFormat(new Configuration(false), parquetSchema)); + + Properties realtimeSchema = new Properties(); + realtimeSchema.setProperty(FILE_INPUT_FORMAT, "org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat"); + assertTrue(shouldUseRecordReaderFromInputFormat(new Configuration(false), realtimeSchema)); + } + private static void assertToPartitionValues(String partitionName) throws MetaException { diff --git a/pom.xml b/pom.xml index badf01837808..62895a0da7ef 100644 --- a/pom.xml +++ b/pom.xml @@ -64,6 +64,7 @@ 3.2.8 1.0.60 5.5.2 + 0.7.0
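The sketch below is illustrative only and not part of the patch; the helper class and method names are hypothetical. Assuming the CustomSplitConversionUtils entry points added above, it shows the round trip a Hudi realtime split makes: InternalHiveSplitFactory flattens the HoodieRealtimeFileSplit returned by getSplits() into a plain string map carried inside HiveSplit, and HiveUtil.createRecordReader later rebuilds the realtime split on the worker from the bare file coordinates plus that map.

package io.trino.plugin.hive.util;

import org.apache.hadoop.mapred.FileSplit;

import java.util.Map;

// Hypothetical helper for illustration; mirrors what InternalHiveSplitFactory (extract side)
// and HiveUtil.createRecordReader (recreate side) do with the customSplitInfo map.
final class CustomSplitRoundTripSketch
{
    private CustomSplitRoundTripSketch() {}

    static FileSplit roundTrip(FileSplit splitFromInputFormat)
    {
        // Split generation: flatten the Hudi-specific fields (base path, delta log paths,
        // max commit time) into a string map that survives HiveSplit serialization.
        Map<String, String> customSplitInfo = CustomSplitConversionUtils.extractCustomSplitInfo(splitFromInputFormat);

        // Worker side: only path/start/length are available again, so rebuild a plain FileSplit
        // and let the converter upgrade it back to a HoodieRealtimeFileSplit when the map
        // identifies one; an unrecognized map leaves the plain split untouched.
        FileSplit plainSplit = new FileSplit(
                splitFromInputFormat.getPath(),
                splitFromInputFormat.getStart(),
                splitFromInputFormat.getLength(),
                (String[]) null);
        return CustomSplitConversionUtils.recreateSplitWithCustomInfo(plainSplit, customSplitInfo);
    }
}

Because the converter list is the only Hudi-specific hook, splits from other input formats produce an empty map and pass through both calls unchanged, which keeps non-Hudi tables on the existing code path.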