Skip to content

Commit 8bf43b5

Browse files
jinyangli34 authored and raunaqmorarka committed
Write row group fileOffset in parquet file footer
1 parent 358633f commit 8bf43b5

File tree

5 files changed

+77
-35
lines changed

5 files changed

+77
-35
lines changed

lib/trino-parquet/src/main/java/io/trino/parquet/metadata/ParquetMetadata.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
*/
1414
package io.trino.parquet.metadata;
1515

16+
import com.google.common.annotations.VisibleForTesting;
1617
import com.google.common.collect.ImmutableList;
1718
import com.google.common.collect.ImmutableMap;
1819
import io.airlift.log.Logger;
@@ -138,6 +139,12 @@ public List<BlockMetadata> getBlocks()
138139
return blocks;
139140
}
140141

142+
@VisibleForTesting
143+
public FileMetaData getParquetMetadata()
144+
{
145+
return parquetMetadata;
146+
}
147+
141148
private static MessageType readParquetSchema(List<SchemaElement> schema)
142149
{
143150
Iterator<SchemaElement> schemaIterator = schema.iterator();

lib/trino-parquet/src/main/java/io/trino/parquet/writer/ParquetWriter.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ private void flush()
350350
columnMetaDataBuilder.add(columnMetaData);
351351
currentOffset += columnMetaData.getTotal_compressed_size();
352352
}
353-
updateRowGroups(columnMetaDataBuilder.build());
353+
updateRowGroups(columnMetaDataBuilder.build(), outputStream.longSize());
354354

355355
// flush pages
356356
for (BufferData bufferData : bufferDataList) {
@@ -409,12 +409,14 @@ private void writeBloomFilters(List<RowGroup> rowGroups, List<List<Optional<Bloo
409409
}
410410
}
411411

412-
private void updateRowGroups(List<ColumnMetaData> columnMetaData)
412+
private void updateRowGroups(List<ColumnMetaData> columnMetaData, long fileOffset)
413413
{
414414
long totalCompressedBytes = columnMetaData.stream().mapToLong(ColumnMetaData::getTotal_compressed_size).sum();
415415
long totalBytes = columnMetaData.stream().mapToLong(ColumnMetaData::getTotal_uncompressed_size).sum();
416416
ImmutableList<org.apache.parquet.format.ColumnChunk> columnChunks = columnMetaData.stream().map(ParquetWriter::toColumnChunk).collect(toImmutableList());
417-
fileFooter.addRowGroup(new RowGroup(columnChunks, totalBytes, rows).setTotal_compressed_size(totalCompressedBytes));
417+
fileFooter.addRowGroup(new RowGroup(columnChunks, totalBytes, rows)
418+
.setTotal_compressed_size(totalCompressedBytes)
419+
.setFile_offset(fileOffset));
418420
}
419421

420422
private static Slice serializeFooter(FileMetaData fileMetaData)

lib/trino-parquet/src/test/java/io/trino/parquet/writer/TestParquetWriter.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import org.apache.parquet.format.CompressionCodec;
4747
import org.apache.parquet.format.PageHeader;
4848
import org.apache.parquet.format.PageType;
49+
import org.apache.parquet.format.RowGroup;
4950
import org.apache.parquet.format.Util;
5051
import org.apache.parquet.schema.PrimitiveType;
5152
import org.assertj.core.data.Percentage;
@@ -379,6 +380,38 @@ public void testDictionaryPageOffset()
379380
}
380381
}
381382

383+
@Test
384+
void testRowGroupOffset()
385+
throws IOException
386+
{
387+
// Write a file with 100 rows per row-group
388+
List<String> columnNames = ImmutableList.of("columnA", "columnB");
389+
List<Type> types = ImmutableList.of(INTEGER, BIGINT);
390+
391+
ParquetDataSource dataSource = new TestingParquetDataSource(
392+
writeParquetFile(
393+
ParquetWriterOptions.builder()
394+
.setMaxBlockSize(DataSize.ofBytes(1000))
395+
.build(),
396+
types,
397+
columnNames,
398+
generateInputPages(types, 100, 10)),
399+
new ParquetReaderOptions());
400+
401+
ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, Optional.empty());
402+
List<BlockMetadata> blocks = parquetMetadata.getBlocks();
403+
assertThat(blocks.size()).isGreaterThan(1);
404+
405+
List<RowGroup> rowGroups = parquetMetadata.getParquetMetadata().getRow_groups();
406+
assertThat(rowGroups.size()).isEqualTo(blocks.size());
407+
for (int rowGroupIndex = 0; rowGroupIndex < rowGroups.size(); rowGroupIndex++) {
408+
RowGroup rowGroup = rowGroups.get(rowGroupIndex);
409+
assertThat(rowGroup.isSetFile_offset()).isTrue();
410+
BlockMetadata blockMetadata = blocks.get(rowGroupIndex);
411+
assertThat(blockMetadata.getStartingPos()).isEqualTo(rowGroup.getFile_offset());
412+
}
413+
}
414+
382415
@ParameterizedTest
383416
@MethodSource("testWriteBloomFiltersParams")
384417
public void testWriteBloomFilters(Type type, List<?> data)

plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheFileOperations.java

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,12 @@ public void testCacheFileOperations()
9595
.add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.json", 0, 658))
9696
.add(new CacheOperation("InputFile.length", "00000000000000000003.json"))
9797
.add(new CacheOperation("InputFile.newStream", "_last_checkpoint"))
98-
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
99-
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
100-
.add(new CacheOperation("Input.readFully", "key=p1/", 0, 227))
101-
.add(new CacheOperation("Input.readFully", "key=p2/", 0, 227))
102-
.add(new CacheOperation("Alluxio.writeCache", "key=p1/", 0, 227))
103-
.add(new CacheOperation("Alluxio.writeCache", "key=p2/", 0, 227))
98+
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 229))
99+
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 229))
100+
.add(new CacheOperation("Input.readFully", "key=p1/", 0, 229))
101+
.add(new CacheOperation("Input.readFully", "key=p2/", 0, 229))
102+
.add(new CacheOperation("Alluxio.writeCache", "key=p1/", 0, 229))
103+
.add(new CacheOperation("Alluxio.writeCache", "key=p2/", 0, 229))
104104
.build());
105105
assertFileSystemAccesses(
106106
"SELECT * FROM test_cache_file_operations",
@@ -113,8 +113,8 @@ public void testCacheFileOperations()
113113
.add(new CacheOperation("InputFile.length", "00000000000000000002.json"))
114114
.add(new CacheOperation("InputFile.length", "00000000000000000003.json"))
115115
.add(new CacheOperation("InputFile.newStream", "_last_checkpoint"))
116-
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
117-
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
116+
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 229))
117+
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 229))
118118
.build());
119119
assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p3', '3-xyz')", 1);
120120
assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p4', '4-xyz')", 1);
@@ -139,17 +139,17 @@ public void testCacheFileOperations()
139139
.add(new CacheOperation("InputFile.length", "00000000000000000005.json"))
140140
.add(new CacheOperation("InputFile.length", "00000000000000000006.json"))
141141
.add(new CacheOperation("InputFile.newStream", "_last_checkpoint"))
142-
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
143-
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
144-
.add(new CacheOperation("Alluxio.readCached", "key=p3/", 0, 227))
145-
.add(new CacheOperation("Alluxio.readCached", "key=p4/", 0, 227))
146-
.add(new CacheOperation("Alluxio.readCached", "key=p5/", 0, 227))
147-
.add(new CacheOperation("Input.readFully", "key=p3/", 0, 227))
148-
.add(new CacheOperation("Input.readFully", "key=p4/", 0, 227))
149-
.add(new CacheOperation("Input.readFully", "key=p5/", 0, 227))
150-
.add(new CacheOperation("Alluxio.writeCache", "key=p3/", 0, 227))
151-
.add(new CacheOperation("Alluxio.writeCache", "key=p4/", 0, 227))
152-
.add(new CacheOperation("Alluxio.writeCache", "key=p5/", 0, 227))
142+
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 229))
143+
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 229))
144+
.add(new CacheOperation("Alluxio.readCached", "key=p3/", 0, 229))
145+
.add(new CacheOperation("Alluxio.readCached", "key=p4/", 0, 229))
146+
.add(new CacheOperation("Alluxio.readCached", "key=p5/", 0, 229))
147+
.add(new CacheOperation("Input.readFully", "key=p3/", 0, 229))
148+
.add(new CacheOperation("Input.readFully", "key=p4/", 0, 229))
149+
.add(new CacheOperation("Input.readFully", "key=p5/", 0, 229))
150+
.add(new CacheOperation("Alluxio.writeCache", "key=p3/", 0, 229))
151+
.add(new CacheOperation("Alluxio.writeCache", "key=p4/", 0, 229))
152+
.add(new CacheOperation("Alluxio.writeCache", "key=p5/", 0, 229))
153153
.build());
154154
assertFileSystemAccesses(
155155
"SELECT * FROM test_cache_file_operations",
@@ -168,11 +168,11 @@ public void testCacheFileOperations()
168168
.add(new CacheOperation("InputFile.length", "00000000000000000005.json"))
169169
.add(new CacheOperation("InputFile.length", "00000000000000000006.json"))
170170
.add(new CacheOperation("InputFile.newStream", "_last_checkpoint"))
171-
.addCopies(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 227), 1)
172-
.addCopies(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 227), 1)
173-
.addCopies(new CacheOperation("Alluxio.readCached", "key=p3/", 0, 227), 1)
174-
.addCopies(new CacheOperation("Alluxio.readCached", "key=p4/", 0, 227), 1)
175-
.addCopies(new CacheOperation("Alluxio.readCached", "key=p5/", 0, 227), 1)
171+
.addCopies(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 229), 1)
172+
.addCopies(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 229), 1)
173+
.addCopies(new CacheOperation("Alluxio.readCached", "key=p3/", 0, 229), 1)
174+
.addCopies(new CacheOperation("Alluxio.readCached", "key=p4/", 0, 229), 1)
175+
.addCopies(new CacheOperation("Alluxio.readCached", "key=p5/", 0, 229), 1)
176176
.build());
177177
}
178178

plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAlluxioCacheMutableTransactionLog.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,12 @@ public void testTableDataCachedWhileTransactionLogNotCached()
7979
.addCopies(new CacheFileSystemTraceUtils.CacheOperation("Input.readTail", "00000000000000000002.checkpoint.parquet"), 2)
8080
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000003.json"))
8181
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "_last_checkpoint"))
82-
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
83-
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
84-
.add(new CacheFileSystemTraceUtils.CacheOperation("Input.readFully", "key=p1/", 0, 227))
85-
.add(new CacheFileSystemTraceUtils.CacheOperation("Input.readFully", "key=p2/", 0, 227))
86-
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.writeCache", "key=p1/", 0, 227))
87-
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.writeCache", "key=p2/", 0, 227))
82+
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 229))
83+
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p2/", 0, 229))
84+
.add(new CacheFileSystemTraceUtils.CacheOperation("Input.readFully", "key=p1/", 0, 229))
85+
.add(new CacheFileSystemTraceUtils.CacheOperation("Input.readFully", "key=p2/", 0, 229))
86+
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.writeCache", "key=p1/", 0, 229))
87+
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.writeCache", "key=p2/", 0, 229))
8888
.build());
8989
assertFileSystemAccesses(
9090
"SELECT * FROM test_transaction_log_not_cached",
@@ -93,8 +93,8 @@ public void testTableDataCachedWhileTransactionLogNotCached()
9393
.addCopies(new CacheFileSystemTraceUtils.CacheOperation("Input.readTail", "00000000000000000002.checkpoint.parquet"), 2)
9494
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000003.json"))
9595
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "_last_checkpoint"))
96-
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
97-
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
96+
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 229))
97+
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p2/", 0, 229))
9898
.build());
9999
}
100100

0 commit comments

Comments (0)