Skip to content

Commit 2ef6dc1

Browse files
jinyangli34 authored and raunaqmorarka committed
Parse only required row groups from parquet footer
1 parent 8bf43b5 commit 2ef6dc1

File tree

7 files changed

+60
-6
lines changed

7 files changed

+60
-6
lines changed

lib/trino-parquet/src/main/java/io/trino/parquet/metadata/ParquetMetadata.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ public String toString()
9090

9191
public List<BlockMetadata> getBlocks()
9292
throws ParquetCorruptionException
93+
{
94+
return getBlocks(0, Long.MAX_VALUE);
95+
}
96+
97+
public List<BlockMetadata> getBlocks(long splitStart, long splitLength)
98+
throws ParquetCorruptionException
9399
{
94100
List<SchemaElement> schema = parquetMetadata.getSchema();
95101
validateParquet(!schema.isEmpty(), dataSourceId, "Schema is empty");
@@ -99,6 +105,14 @@ public List<BlockMetadata> getBlocks()
99105
List<RowGroup> rowGroups = parquetMetadata.getRow_groups();
100106
if (rowGroups != null) {
101107
for (RowGroup rowGroup : rowGroups) {
108+
if (rowGroup.isSetFile_offset()) {
109+
long rowGroupStart = rowGroup.getFile_offset();
110+
boolean splitContainsRowGroup = splitStart <= rowGroupStart && rowGroupStart < splitStart + splitLength;
111+
if (!splitContainsRowGroup) {
112+
continue;
113+
}
114+
}
115+
102116
List<ColumnChunk> columns = rowGroup.getColumns();
103117
validateParquet(!columns.isEmpty(), dataSourceId, "No columns in row group: %s", rowGroup);
104118
String filePath = columns.get(0).getFile_path();

lib/trino-parquet/src/main/java/io/trino/parquet/predicate/PredicateUtils.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import io.trino.parquet.ParquetReaderOptions;
2828
import io.trino.parquet.metadata.BlockMetadata;
2929
import io.trino.parquet.metadata.ColumnChunkMetadata;
30+
import io.trino.parquet.metadata.ParquetMetadata;
3031
import io.trino.parquet.metadata.PrunedBlockMetadata;
3132
import io.trino.parquet.reader.RowGroupInfo;
3233
import io.trino.spi.predicate.TupleDomain;
@@ -183,7 +184,7 @@ public static List<RowGroupInfo> getFilteredRowGroups(
183184
long splitStart,
184185
long splitLength,
185186
ParquetDataSource dataSource,
186-
List<BlockMetadata> blocksMetaData,
187+
ParquetMetadata parquetMetadata,
187188
List<TupleDomain<ColumnDescriptor>> parquetTupleDomains,
188189
List<TupleDomainParquetPredicate> parquetPredicates,
189190
Map<List<String>, ColumnDescriptor> descriptorsByPath,
@@ -194,7 +195,7 @@ public static List<RowGroupInfo> getFilteredRowGroups(
194195
{
195196
long fileRowCount = 0;
196197
ImmutableList.Builder<RowGroupInfo> rowGroupInfoBuilder = ImmutableList.builder();
197-
for (BlockMetadata block : blocksMetaData) {
198+
for (BlockMetadata block : parquetMetadata.getBlocks(splitStart, splitLength)) {
198199
long blockStart = block.getStartingPos();
199200
boolean splitContainsBlock = splitStart <= blockStart && blockStart < splitStart + splitLength;
200201
if (splitContainsBlock) {

lib/trino-parquet/src/test/java/io/trino/parquet/ParquetTestUtils.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ public static ParquetReader createParquetReader(
155155
0,
156156
input.getEstimatedSize(),
157157
input,
158-
parquetMetadata.getBlocks(),
158+
parquetMetadata,
159159
ImmutableList.of(parquetTupleDomain),
160160
ImmutableList.of(parquetPredicate),
161161
descriptorsByPath,

lib/trino-parquet/src/test/java/io/trino/parquet/reader/TestParquetReader.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import io.trino.memory.context.AggregatedMemoryContext;
2121
import io.trino.parquet.ParquetDataSource;
2222
import io.trino.parquet.ParquetReaderOptions;
23+
import io.trino.parquet.metadata.BlockMetadata;
2324
import io.trino.parquet.metadata.ParquetMetadata;
2425
import io.trino.parquet.writer.ParquetWriterOptions;
2526
import io.trino.spi.Page;
@@ -186,6 +187,44 @@ public void testBackwardsCompatibleRepeatedPrimitiveFieldDefinedAsPrimitive()
186187
.isInstanceOf(TrinoException.class);
187188
}
188189

190+
@Test
191+
void testReadMetadataWithSplitOffset()
192+
throws IOException
193+
{
194+
// Write a file with 100 rows per row-group
195+
List<String> columnNames = ImmutableList.of("columna", "columnb");
196+
List<Type> types = ImmutableList.of(INTEGER, BIGINT);
197+
198+
ParquetDataSource dataSource = new TestingParquetDataSource(
199+
writeParquetFile(
200+
ParquetWriterOptions.builder()
201+
.setMaxBlockSize(DataSize.ofBytes(1000))
202+
.build(),
203+
types,
204+
columnNames,
205+
generateInputPages(types, 100, 5)),
206+
new ParquetReaderOptions());
207+
208+
// Read both columns, 1 row group
209+
ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, Optional.empty());
210+
List<BlockMetadata> columnBlocks = parquetMetadata.getBlocks(0, 800);
211+
assertThat(columnBlocks.size()).isEqualTo(1);
212+
assertThat(columnBlocks.getFirst().columns().size()).isEqualTo(2);
213+
assertThat(columnBlocks.getFirst().rowCount()).isEqualTo(100);
214+
215+
// Read both columns, half row groups
216+
parquetMetadata = MetadataReader.readFooter(dataSource, Optional.empty());
217+
columnBlocks = parquetMetadata.getBlocks(0, 2500);
218+
assertThat(columnBlocks.stream().allMatch(block -> block.columns().size() == 2)).isTrue();
219+
assertThat(columnBlocks.stream().mapToLong(BlockMetadata::rowCount).sum()).isEqualTo(300);
220+
221+
// Read both columns, all row groups
222+
parquetMetadata = MetadataReader.readFooter(dataSource, Optional.empty());
223+
columnBlocks = parquetMetadata.getBlocks();
224+
assertThat(columnBlocks.stream().allMatch(block -> block.columns().size() == 2)).isTrue();
225+
assertThat(columnBlocks.stream().mapToLong(BlockMetadata::rowCount).sum()).isEqualTo(500);
226+
}
227+
189228
private void testReadingOldParquetFiles(File file, List<String> columnNames, Type columnType, List<?> expectedValues)
190229
throws IOException
191230
{

plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ public static ReaderPageSource createPageSource(
253253
start,
254254
length,
255255
dataSource,
256-
parquetMetadata.getBlocks(),
256+
parquetMetadata,
257257
parquetTupleDomains,
258258
parquetPredicates,
259259
descriptorsByPath,

plugin/trino-hudi/src/main/java/io/trino/plugin/hudi/HudiPageSourceProvider.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ private static ConnectorPageSource createPageSource(
218218
start,
219219
length,
220220
dataSource,
221-
parquetMetadata.getBlocks(),
221+
parquetMetadata,
222222
ImmutableList.of(parquetTupleDomain),
223223
ImmutableList.of(parquetPredicate),
224224
descriptorsByPath,

plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSourceProvider.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -891,7 +891,7 @@ private static ReaderPageSourceWithRowPositions createParquetPageSource(
891891
start,
892892
length,
893893
dataSource,
894-
parquetMetadata.getBlocks(),
894+
parquetMetadata,
895895
ImmutableList.of(parquetTupleDomain),
896896
ImmutableList.of(parquetPredicate),
897897
descriptorsByPath,

0 commit comments

Comments (0)