diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/reader/TrinoColumnIndexStore.java b/lib/trino-parquet/src/main/java/io/trino/parquet/reader/TrinoColumnIndexStore.java index 041c4e1250ae..94c6f753637d 100644 --- a/lib/trino-parquet/src/main/java/io/trino/parquet/reader/TrinoColumnIndexStore.java +++ b/lib/trino-parquet/src/main/java/io/trino/parquet/reader/TrinoColumnIndexStore.java @@ -18,7 +18,11 @@ import com.google.common.collect.ListMultimap; import io.trino.parquet.DiskRange; import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.TupleDomain; import jakarta.annotation.Nullable; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.format.Util; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.metadata.BlockMetaData; @@ -32,12 +36,15 @@ import java.io.IOException; import java.io.InputStream; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.function.BiFunction; import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; import static java.util.Objects.requireNonNull; @@ -129,6 +136,43 @@ public OffsetIndex getOffsetIndex(ColumnPath column) return offsetIndexStore.get(column); } + public static Optional getColumnIndexStore( + ParquetDataSource dataSource, + BlockMetaData blockMetadata, + Map, ColumnDescriptor> descriptorsByPath, + TupleDomain parquetTupleDomain, + ParquetReaderOptions options) + { + if (!options.isUseColumnIndex() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) { + return Optional.empty(); + } + + boolean hasColumnIndex = false; + for (ColumnChunkMetaData column : blockMetadata.getColumns()) { + if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) { + hasColumnIndex = true; + break; + } + } + + if (!hasColumnIndex) { + return Optional.empty(); + } + + Set columnsReadPaths = new HashSet<>(descriptorsByPath.size()); + for (List path : descriptorsByPath.keySet()) { + columnsReadPaths.add(ColumnPath.get(path.toArray(new String[0]))); + } + + Map parquetDomains = parquetTupleDomain.getDomains() + .orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains")); + Set columnsFilteredPaths = parquetDomains.keySet().stream() + .map(column -> ColumnPath.get(column.getPath())) + .collect(toImmutableSet()); + + return Optional.of(new TrinoColumnIndexStore(dataSource, blockMetadata, columnsReadPaths, columnsFilteredPaths)); + } + private static Map loadIndexes( ParquetDataSource dataSource, List indexMetadata, diff --git a/plugin/trino-hive/pom.xml b/plugin/trino-hive/pom.xml index f844ae68cb26..3d37acc7756a 100644 --- a/plugin/trino-hive/pom.xml +++ b/plugin/trino-hive/pom.xml @@ -221,11 +221,6 @@ parquet-column - - org.apache.parquet - parquet-common - - org.apache.parquet parquet-format-structures @@ -299,6 +294,12 @@ runtime + + org.apache.parquet + parquet-common + runtime + + io.airlift junit-extensions diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java index f7919c4da1bf..4482eb3cd090 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java @@ -33,7 +33,6 @@ import io.trino.parquet.predicate.TupleDomainParquetPredicate; import io.trino.parquet.reader.MetadataReader; import io.trino.parquet.reader.ParquetReader; -import io.trino.parquet.reader.TrinoColumnIndexStore; import io.trino.plugin.hive.AcidInfo; import io.trino.plugin.hive.FileFormatDataSourceStats; import io.trino.plugin.hive.HiveColumnHandle; @@ -51,8 +50,6 @@ import io.trino.spi.predicate.TupleDomain; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; @@ -63,7 +60,6 @@ import org.joda.time.DateTimeZone; import java.io.IOException; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -74,7 +70,6 @@ import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableList.toImmutableList; -import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; import static io.trino.parquet.BloomFilterStore.getBloomFilterStore; import static io.trino.parquet.ParquetTypeUtils.constructField; @@ -84,6 +79,7 @@ import static io.trino.parquet.ParquetTypeUtils.lookupColumnByName; import static io.trino.parquet.predicate.PredicateUtils.buildPredicate; import static io.trino.parquet.predicate.PredicateUtils.predicateMatches; +import static io.trino.parquet.reader.TrinoColumnIndexStore.getColumnIndexStore; import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; import static io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; @@ -383,43 +379,6 @@ public static Optional getColumnType(HiveColumnH return Optional.of(new GroupType(baseType.getRepetition(), baseType.getName(), ImmutableList.of(type))); } - public static Optional getColumnIndexStore( - ParquetDataSource dataSource, - BlockMetaData blockMetadata, - Map, ColumnDescriptor> descriptorsByPath, - TupleDomain parquetTupleDomain, - ParquetReaderOptions options) - { - if (!options.isUseColumnIndex() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) { - return Optional.empty(); - } - - boolean hasColumnIndex = false; - for (ColumnChunkMetaData column : blockMetadata.getColumns()) { - if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) { - hasColumnIndex = true; - break; - } - } - - if (!hasColumnIndex) { - return Optional.empty(); - } - - Set columnsReadPaths = new HashSet<>(descriptorsByPath.size()); - for (List path : descriptorsByPath.keySet()) { - columnsReadPaths.add(ColumnPath.get(path.toArray(new String[0]))); - } - - Map parquetDomains = parquetTupleDomain.getDomains() - .orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains")); - Set columnsFilteredPaths = parquetDomains.keySet().stream() - .map(column -> ColumnPath.get(column.getPath())) - .collect(toImmutableSet()); - - return Optional.of(new TrinoColumnIndexStore(dataSource, blockMetadata, columnsReadPaths, columnsFilteredPaths)); - } - public static TupleDomain getParquetTupleDomain( Map, ColumnDescriptor> descriptorsByPath, TupleDomain effectivePredicate, diff --git a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java index 93b0e3c5af02..9c29179861c7 100644 --- a/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java +++ b/plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java @@ -75,11 +75,11 @@ import static io.trino.parquet.ParquetTypeUtils.getDescriptors; import static io.trino.parquet.predicate.PredicateUtils.buildPredicate; import static io.trino.parquet.predicate.PredicateUtils.predicateMatches; +import static io.trino.parquet.reader.TrinoColumnIndexStore.getColumnIndexStore; import static io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns; import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.ParquetReaderProvider; import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.createDataSource; import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.createParquetPageSource; -import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getColumnIndexStore; import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getParquetMessageType; import static io.trino.plugin.hive.parquet.ParquetPageSourceFactory.getParquetTupleDomain; import static io.trino.plugin.hive.util.HiveUtil.makePartName;