ParquetWriteValidation.java
@@ -18,6 +18,7 @@
 import io.airlift.slice.Slice;
 import io.airlift.slice.Slices;
 import io.airlift.slice.XxHash64;
+import io.trino.parquet.reader.RowGroupInfo;
 import io.trino.spi.Page;
 import io.trino.spi.block.Block;
 import io.trino.spi.type.Type;
@@ -126,17 +127,17 @@ public void validateColumns(ParquetDataSourceId dataSourceId, MessageType schema
         }
     }
 
-    public void validateBlocksMetadata(ParquetDataSourceId dataSourceId, List<BlockMetaData> blocksMetaData)
+    public void validateBlocksMetadata(ParquetDataSourceId dataSourceId, List<RowGroupInfo> rowGroupInfos)
             throws ParquetCorruptionException
     {
         validateParquet(
-                blocksMetaData.size() == rowGroups.size(),
+                rowGroupInfos.size() == rowGroups.size(),
                 dataSourceId,
                 "Number of row groups %d did not match %d",
-                blocksMetaData.size(),
+                rowGroupInfos.size(),
                 rowGroups.size());
-        for (int rowGroupIndex = 0; rowGroupIndex < blocksMetaData.size(); rowGroupIndex++) {
-            BlockMetaData block = blocksMetaData.get(rowGroupIndex);
+        for (int rowGroupIndex = 0; rowGroupIndex < rowGroupInfos.size(); rowGroupIndex++) {
+            BlockMetaData block = rowGroupInfos.get(rowGroupIndex).blockMetaData();
             RowGroup rowGroup = rowGroups.get(rowGroupIndex);
             validateParquet(
                     block.getRowCount() == rowGroup.getNum_rows(),
PredicateUtils.java
@@ -23,6 +23,8 @@
 import io.trino.parquet.ParquetDataSource;
 import io.trino.parquet.ParquetDataSourceId;
 import io.trino.parquet.ParquetEncoding;
+import io.trino.parquet.ParquetReaderOptions;
+import io.trino.parquet.reader.RowGroupInfo;
 import io.trino.spi.predicate.TupleDomain;
 import io.trino.spi.type.DecimalType;
 import io.trino.spi.type.Type;
@@ -50,9 +52,11 @@
 import java.util.Optional;
 import java.util.Set;
 
+import static io.trino.parquet.BloomFilterStore.getBloomFilterStore;
 import static io.trino.parquet.ParquetCompressionUtils.decompress;
 import static io.trino.parquet.ParquetReaderUtils.isOnlyDictionaryEncodingPages;
 import static io.trino.parquet.ParquetTypeUtils.getParquetEncoding;
+import static io.trino.parquet.reader.TrinoColumnIndexStore.getColumnIndexStore;
 import static io.trino.spi.type.BigintType.BIGINT;
 import static io.trino.spi.type.DateType.DATE;
 import static io.trino.spi.type.IntegerType.INTEGER;
@@ -172,6 +176,50 @@ public static boolean predicateMatches(
                 columnIndexStore);
     }
 
+    public static List<RowGroupInfo> getFilteredRowGroups(
+            long splitStart,
+            long splitLength,
+            ParquetDataSource dataSource,
+            List<BlockMetaData> blocksMetaData,
+            List<TupleDomain<ColumnDescriptor>> parquetTupleDomains,
+            List<TupleDomainParquetPredicate> parquetPredicates,
+            Map<List<String>, ColumnDescriptor> descriptorsByPath,
+            DateTimeZone timeZone,
+            int domainCompactionThreshold,
+            ParquetReaderOptions options)
+            throws IOException
+    {
+        long fileRowCount = 0;
+        ImmutableList.Builder<RowGroupInfo> rowGroupInfoBuilder = ImmutableList.builder();
+        for (BlockMetaData block : blocksMetaData) {
+            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
+            boolean splitContainsBlock = splitStart <= firstDataPage && firstDataPage < splitStart + splitLength;
+            if (splitContainsBlock) {
+                for (int i = 0; i < parquetTupleDomains.size(); i++) {
+                    TupleDomain<ColumnDescriptor> parquetTupleDomain = parquetTupleDomains.get(i);
+                    TupleDomainParquetPredicate parquetPredicate = parquetPredicates.get(i);
+                    Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
+                    Optional<BloomFilterStore> bloomFilterStore = getBloomFilterStore(dataSource, block, parquetTupleDomain, options);
+                    if (predicateMatches(
+                            parquetPredicate,
+                            block,
+                            dataSource,
+                            descriptorsByPath,
+                            parquetTupleDomain,
+                            columnIndex,
+                            bloomFilterStore,
+                            timeZone,
+                            domainCompactionThreshold)) {
+                        rowGroupInfoBuilder.add(new RowGroupInfo(block, fileRowCount, columnIndex));
+                        break;
+                    }
+                }
+            }
+            fileRowCount += block.getRowCount();
+        }
+        return rowGroupInfoBuilder.build();
+    }
+
     private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, ColumnDescriptor> descriptorsByPath)
     {
         ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> statistics = ImmutableMap.builder();
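For orientation, the following sketch (not code from this PR) shows how a read path could wire the new helper into the slimmed-down ParquetReader constructor that appears later in this diff. Names such as splitStart, tupleDomain, parquetPredicate, descriptorsByPath, fileMetaData, and memoryContext stand for inputs the caller is assumed to prepare exactly as before:

// Sketch under the assumptions above; uses a single predicate/domain pair.
List<RowGroupInfo> rowGroups = getFilteredRowGroups(
        splitStart,
        splitLength,
        dataSource,
        parquetMetadata.getBlocks(),
        ImmutableList.of(tupleDomain),
        ImmutableList.of(parquetPredicate),
        descriptorsByPath,
        timeZone,
        domainCompactionThreshold,
        options);
// Only matching row groups reach the reader; each RowGroupInfo already carries
// its file row offset and any column index loaded while testing the predicate.
ParquetReader reader = new ParquetReader(
        Optional.ofNullable(fileMetaData.getCreatedBy()),
        columnFields,
        rowGroups,
        dataSource,
        timeZone,
        memoryContext,
        options,
        exceptionTransform,
        Optional.of(parquetPredicate),
        Optional.empty());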
ParquetReader.java
@@ -80,7 +80,6 @@
 import static java.lang.Math.min;
 import static java.lang.Math.toIntExact;
 import static java.lang.String.format;
-import static java.util.Collections.nCopies;
 import static java.util.Objects.requireNonNull;
 
 public class ParquetReader
@@ -94,8 +93,7 @@ public class ParquetReader
     public static final String COLUMN_INDEX_ROWS_FILTERED = "ParquetColumnIndexRowsFiltered";
 
     private final Optional<String> fileCreatedBy;
-    private final List<BlockMetaData> blocks;
-    private final List<Long> firstRowsOfBlocks;
+    private final List<RowGroupInfo> rowGroups;
     private final List<Column> columnFields;
     private final List<PrimitiveField> primitiveFields;
     private final ParquetDataSource dataSource;
@@ -123,7 +121,6 @@ public class ParquetReader
 
     private AggregatedMemoryContext currentRowGroupMemoryContext;
     private final Map<ChunkKey, ChunkedInputStream> chunkReaders;
-    private final List<Optional<ColumnIndexStore>> columnIndexStore;
     private final Optional<ParquetWriteValidation> writeValidation;
     private final Optional<WriteChecksumBuilder> writeChecksumBuilder;
     private final Optional<StatisticsValidation> rowGroupStatisticsValidation;
@@ -136,39 +133,21 @@ public class ParquetReader
     public ParquetReader(
             Optional<String> fileCreatedBy,
             List<Column> columnFields,
-            List<BlockMetaData> blocks,
-            List<Long> firstRowsOfBlocks,
-            ParquetDataSource dataSource,
-            DateTimeZone timeZone,
-            AggregatedMemoryContext memoryContext,
-            ParquetReaderOptions options,
-            Function<Exception, RuntimeException> exceptionTransform)
-            throws IOException
-    {
-        this(fileCreatedBy, columnFields, blocks, firstRowsOfBlocks, dataSource, timeZone, memoryContext, options, exceptionTransform, Optional.empty(), nCopies(blocks.size(), Optional.empty()), Optional.empty());
-    }
-
-    public ParquetReader(
-            Optional<String> fileCreatedBy,
-            List<Column> columnFields,
-            List<BlockMetaData> blocks,
-            List<Long> firstRowsOfBlocks,
+            List<RowGroupInfo> rowGroups,
             ParquetDataSource dataSource,
             DateTimeZone timeZone,
             AggregatedMemoryContext memoryContext,
             ParquetReaderOptions options,
             Function<Exception, RuntimeException> exceptionTransform,
             Optional<TupleDomainParquetPredicate> parquetPredicate,
-            List<Optional<ColumnIndexStore>> columnIndexStore,
             Optional<ParquetWriteValidation> writeValidation)
             throws IOException
     {
         this.fileCreatedBy = requireNonNull(fileCreatedBy, "fileCreatedBy is null");
         requireNonNull(columnFields, "columnFields is null");
         this.columnFields = ImmutableList.copyOf(columnFields);
         this.primitiveFields = getPrimitiveFields(columnFields.stream().map(Column::field).collect(toImmutableList()));
-        this.blocks = requireNonNull(blocks, "blocks is null");
-        this.firstRowsOfBlocks = requireNonNull(firstRowsOfBlocks, "firstRowsOfBlocks is null");
+        this.rowGroups = requireNonNull(rowGroups, "rowGroups is null");
         this.dataSource = requireNonNull(dataSource, "dataSource is null");
         this.columnReaderFactory = new ColumnReaderFactory(timeZone);
         this.memoryContext = requireNonNull(memoryContext, "memoryContext is null");
@@ -178,31 +157,28 @@ public ParquetReader(
         this.columnReaders = new HashMap<>();
         this.maxBytesPerCell = new HashMap<>();
 
-        checkArgument(blocks.size() == firstRowsOfBlocks.size(), "elements of firstRowsOfBlocks must correspond to blocks");
-
         this.writeValidation = requireNonNull(writeValidation, "writeValidation is null");
         validateWrite(
                 validation -> fileCreatedBy.equals(Optional.of(validation.getCreatedBy())),
                 "Expected created by %s, found %s",
                 writeValidation.map(ParquetWriteValidation::getCreatedBy),
                 fileCreatedBy);
-        validateBlockMetadata(blocks);
+        validateBlockMetadata(rowGroups);
         this.writeChecksumBuilder = writeValidation.map(validation -> createWriteChecksumBuilder(validation.getTypes()));
         this.rowGroupStatisticsValidation = writeValidation.map(validation -> createStatisticsValidationBuilder(validation.getTypes()));
 
         requireNonNull(parquetPredicate, "parquetPredicate is null");
-        this.columnIndexStore = requireNonNull(columnIndexStore, "columnIndexStore is null");
         Optional<FilterPredicate> filter = Optional.empty();
         if (parquetPredicate.isPresent() && options.isUseColumnIndex()) {
             filter = parquetPredicate.get().toParquetFilter(timeZone);
         }
-        this.blockRowRanges = calculateFilteredRowRanges(blocks, filter, columnIndexStore, primitiveFields);
+        this.blockRowRanges = calculateFilteredRowRanges(rowGroups, filter, primitiveFields);
 
         this.blockFactory = new ParquetBlockFactory(exceptionTransform);
         ListMultimap<ChunkKey, DiskRange> ranges = ArrayListMultimap.create();
         Map<String, LongCount> codecMetrics = new HashMap<>();
-        for (int rowGroup = 0; rowGroup < blocks.size(); rowGroup++) {
-            BlockMetaData metadata = blocks.get(rowGroup);
+        for (int rowGroup = 0; rowGroup < rowGroups.size(); rowGroup++) {
+            BlockMetaData metadata = rowGroups.get(rowGroup).blockMetaData();
             for (PrimitiveField field : primitiveFields) {
                 int columnId = field.getId();
                 ColumnChunkMetaData chunkMetadata = getColumnChunkMetaData(metadata, field.getDescriptor());
@@ -317,11 +293,12 @@ private boolean advanceToNextRowGroup()
         }
 
         currentRowGroup++;
-        if (currentRowGroup == blocks.size()) {
+        if (currentRowGroup == rowGroups.size()) {
             return false;
         }
-        currentBlockMetadata = blocks.get(currentRowGroup);
-        firstRowIndexInGroup = firstRowsOfBlocks.get(currentRowGroup);
+        RowGroupInfo rowGroupInfo = rowGroups.get(currentRowGroup);
+        currentBlockMetadata = rowGroupInfo.blockMetaData();
+        firstRowIndexInGroup = rowGroupInfo.fileRowOffset();
         currentGroupRowCount = currentBlockMetadata.getRowCount();
         FilteredRowRanges currentGroupRowRanges = blockRowRanges[currentRowGroup];
         log.debug("advanceToNextRowGroup dataSource %s, currentRowGroup %d, rowRanges %s, currentBlockMetadata %s", dataSource.getId(), currentRowGroup, currentGroupRowRanges, currentBlockMetadata);
@@ -450,7 +427,7 @@ private static Block toNotNullSupressedBlock(int positionCount, boolean[] rowIsN
     @Nullable
     private FilteredOffsetIndex getFilteredOffsetIndex(FilteredRowRanges rowRanges, int rowGroup, long rowGroupRowCount, ColumnPath columnPath)
     {
-        Optional<ColumnIndexStore> rowGroupColumnIndexStore = this.columnIndexStore.get(rowGroup);
+        Optional<ColumnIndexStore> rowGroupColumnIndexStore = this.rowGroups.get(rowGroup).columnIndexStore();
         if (rowGroupColumnIndexStore.isEmpty()) {
             return null;
         }
@@ -587,24 +564,24 @@ public AggregatedMemoryContext getMemoryContext()
     }
 
     private static FilteredRowRanges[] calculateFilteredRowRanges(
-            List<BlockMetaData> blocks,
+            List<RowGroupInfo> rowGroups,
             Optional<FilterPredicate> filter,
-            List<Optional<ColumnIndexStore>> columnIndexStore,
             List<PrimitiveField> primitiveFields)
     {
-        FilteredRowRanges[] blockRowRanges = new FilteredRowRanges[blocks.size()];
+        FilteredRowRanges[] blockRowRanges = new FilteredRowRanges[rowGroups.size()];
         if (filter.isEmpty()) {
             return blockRowRanges;
         }
         Set<ColumnPath> paths = primitiveFields.stream()
                 .map(field -> ColumnPath.get(field.getDescriptor().getPath()))
                 .collect(toImmutableSet());
-        for (int rowGroup = 0; rowGroup < blocks.size(); rowGroup++) {
-            Optional<ColumnIndexStore> rowGroupColumnIndexStore = columnIndexStore.get(rowGroup);
+        for (int rowGroup = 0; rowGroup < rowGroups.size(); rowGroup++) {
+            RowGroupInfo rowGroupInfo = rowGroups.get(rowGroup);
+            Optional<ColumnIndexStore> rowGroupColumnIndexStore = rowGroupInfo.columnIndexStore();
             if (rowGroupColumnIndexStore.isEmpty()) {
                 continue;
             }
-            BlockMetaData metadata = blocks.get(rowGroup);
+            BlockMetaData metadata = rowGroupInfo.blockMetaData();
             long rowGroupRowCount = metadata.getRowCount();
             FilteredRowRanges rowRanges = new FilteredRowRanges(ColumnIndexFilter.calculateRowRanges(
                     FilterCompat.get(filter.get()),
@@ -627,11 +604,11 @@ private void validateWritePageChecksum(Page page)
         }
     }
 
-    private void validateBlockMetadata(List<BlockMetaData> blockMetaData)
+    private void validateBlockMetadata(List<RowGroupInfo> rowGroups)
             throws ParquetCorruptionException
     {
         if (writeValidation.isPresent()) {
-            writeValidation.get().validateBlocksMetadata(dataSource.getId(), blockMetaData);
+            writeValidation.get().validateBlocksMetadata(dataSource.getId(), rowGroups);
         }
     }
 
RowGroupInfo.java (new file)
@@ -0,0 +1,21 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.parquet.reader;
+
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
+
+import java.util.Optional;
+
+public record RowGroupInfo(BlockMetaData blockMetaData, long fileRowOffset, Optional<ColumnIndexStore> columnIndexStore) {}
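The record above bundles the three per-row-group values that previously traveled in parallel structures (the BlockMetaData list, firstRowsOfBlocks, and a per-group Optional<ColumnIndexStore>), so callers now build and pass a single list. When no predicate pruning is involved, as in the writer validation and test utility changes below, that list is derived straight from the file metadata. A minimal sketch:

long fileRowOffset = 0;
ImmutableList.Builder<RowGroupInfo> rowGroups = ImmutableList.builder();
for (BlockMetaData block : parquetMetadata.getBlocks()) {
    // Optional.empty(): no column index store, so no row-range filtering applies
    rowGroups.add(new RowGroupInfo(block, fileRowOffset, Optional.empty()));
    fileRowOffset += block.getRowCount();
}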
ParquetWriter.java
@@ -26,6 +26,7 @@
 import io.trino.parquet.ParquetWriteValidation;
 import io.trino.parquet.reader.MetadataReader;
 import io.trino.parquet.reader.ParquetReader;
+import io.trino.parquet.reader.RowGroupInfo;
 import io.trino.parquet.writer.ColumnWriter.BufferData;
 import io.trino.spi.Page;
 import io.trino.spi.type.Type;
@@ -69,7 +70,6 @@
 import static java.lang.Math.max;
 import static java.lang.Math.min;
 import static java.nio.charset.StandardCharsets.US_ASCII;
-import static java.util.Collections.nCopies;
 import static java.util.Objects.requireNonNull;
 import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
 
@@ -252,17 +252,15 @@ private ParquetReader createParquetReader(ParquetDataSource input, ParquetMetada
                             .orElseThrow()));
         }
         long nextStart = 0;
-        ImmutableList.Builder<Long> blockStartsBuilder = ImmutableList.builder();
+        ImmutableList.Builder<RowGroupInfo> rowGroupInfoBuilder = ImmutableList.builder();
         for (BlockMetaData block : parquetMetadata.getBlocks()) {
-            blockStartsBuilder.add(nextStart);
+            rowGroupInfoBuilder.add(new RowGroupInfo(block, nextStart, Optional.empty()));
             nextStart += block.getRowCount();
         }
-        List<Long> blockStarts = blockStartsBuilder.build();
         return new ParquetReader(
                 Optional.ofNullable(fileMetaData.getCreatedBy()),
                 columnFields.build(),
-                parquetMetadata.getBlocks(),
-                blockStarts,
+                rowGroupInfoBuilder.build(),
                 input,
                 parquetTimeZone.orElseThrow(),
                 newSimpleAggregatedMemoryContext(),
@@ -272,7 +270,6 @@ private ParquetReader createParquetReader(ParquetDataSource input, ParquetMetada
                     return new RuntimeException(exception);
                 },
                 Optional.empty(),
-                nCopies(blockStarts.size(), Optional.empty()),
                 Optional.of(writeValidation));
     }
 
ParquetTestUtils.java
@@ -18,6 +18,7 @@
 import io.airlift.slice.Slices;
 import io.trino.memory.context.AggregatedMemoryContext;
 import io.trino.parquet.reader.ParquetReader;
+import io.trino.parquet.reader.RowGroupInfo;
 import io.trino.parquet.writer.ParquetSchemaConverter;
 import io.trino.parquet.writer.ParquetWriter;
 import io.trino.parquet.writer.ParquetWriterOptions;
@@ -55,7 +56,6 @@
 import static io.trino.spi.block.MapBlock.fromKeyValueBlock;
 import static io.trino.spi.type.BigintType.BIGINT;
 import static io.trino.spi.type.TypeUtils.writeNativeValue;
-import static java.util.Collections.nCopies;
 import static org.joda.time.DateTimeZone.UTC;
 
 public class ParquetTestUtils
@@ -114,17 +114,15 @@ public static ParquetReader createParquetReader(
                             .orElseThrow()));
         }
         long nextStart = 0;
-        ImmutableList.Builder<Long> blockStartsBuilder = ImmutableList.builder();
+        ImmutableList.Builder<RowGroupInfo> rowGroupInfoBuilder = ImmutableList.builder();
         for (BlockMetaData block : parquetMetadata.getBlocks()) {
-            blockStartsBuilder.add(nextStart);
+            rowGroupInfoBuilder.add(new RowGroupInfo(block, nextStart, Optional.empty()));
             nextStart += block.getRowCount();
         }
-        List<Long> blockStarts = blockStartsBuilder.build();
         return new ParquetReader(
                 Optional.ofNullable(fileMetaData.getCreatedBy()),
                 columnFields.build(),
-                parquetMetadata.getBlocks(),
-                blockStarts,
+                rowGroupInfoBuilder.build(),
                 input,
                 UTC,
                 memoryContext,
@@ -134,7 +132,6 @@ public static ParquetReader createParquetReader(
                     return new RuntimeException(exception);
                 },
                 Optional.empty(),
-                nCopies(blockStarts.size(), Optional.empty()),
                 Optional.empty());
     }
 