@@ -55,7 +55,7 @@ public static Offsets getOffsets(SeekableInputStream input, ColumnChunkMetaData
* (0 cannot be a valid offset because of the MAGIC bytes)
* - The firstDataPageOffset might point to the dictionary page
*/
dictionaryPageSize = readDictionaryPageSize(input, newChunkStart);
dictionaryPageSize = readDictionaryPageSize(input, chunk);
} else {
dictionaryPageSize = chunk.getFirstDataPageOffset() - chunk.getDictionaryPageOffset();
}
@@ -68,12 +68,14 @@ public static Offsets getOffsets(SeekableInputStream input, ColumnChunkMetaData
return new Offsets(firstDataPageOffset, dictionaryPageOffset);
}

private static long readDictionaryPageSize(SeekableInputStream in, long pos) throws IOException {
private static long readDictionaryPageSize(SeekableInputStream in, ColumnChunkMetaData chunk) throws IOException {
long origPos = -1;
try {
origPos = in.getPos();
in.seek(chunk.getStartingPos());
Contributor:

Do we assume the dictionary page is always the chunk starting address?

Contributor Author:

It is not obvious that one has to search for this statement in the Encoding docs, but it is there:

The dictionary page is written first, before the data pages of the column chunk.
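
(A minimal sketch, for illustration only, of how that guarantee is relied on here: because the dictionary page, when present, is written first in the column chunk, its total size can be recovered by reading the page header found at the chunk's starting position. The snippet below essentially restates the changed method as a self-contained example; names mirror the parquet-mr APIs visible in this diff, and it is a sketch rather than the exact patch.)

```java
import java.io.IOException;

import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.Util;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.io.SeekableInputStream;

final class DictionaryPageSizeSketch {
  /**
   * Returns header size + compressed page size of the dictionary page,
   * relying on the spec guarantee that the dictionary page is written
   * first, i.e. that it starts at chunk.getStartingPos().
   */
  static long readDictionaryPageSize(SeekableInputStream in, ColumnChunkMetaData chunk)
      throws IOException {
    long origPos = in.getPos();
    try {
      in.seek(chunk.getStartingPos());             // dictionary page header sits at the chunk start
      long headerStart = in.getPos();
      PageHeader header = Util.readPageHeader(in); // Thrift-encoded page header
      long headerSize = in.getPos() - headerStart; // bytes consumed by the header itself
      return headerSize + header.getCompressed_page_size();
    } finally {
      in.seek(origPos);                            // restore the caller's stream position
    }
  }
}
```

(The ordering guarantee in the Encoding doc is exactly what makes the seek to getStartingPos() safe; if a future page type could ever precede the dictionary page, this calculation would silently return the wrong size, which is the concern raised below.)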

Contributor:

I know it is true today, but what if that assumption is broken as more and more page types are added? Can we add something to the Encoding docs so that people do not change that assumption?

Contributor Author:

I agree it should be specified more clearly, and maybe not only in the Encoding doc but also somewhere on the "main" page, but I feel that is a separate topic.

Contributor:

Can you create a Jira for it, @gszadovszky, so that we don't lose track of it?

Other than that, LGTM!

Contributor Author:

Sure, @shangxinli. Check out PARQUET-2034 for details.

Contributor:

Thanks @gszadovszky

long headerStart = in.getPos();
PageHeader header = Util.readPageHeader(in);
long headerSize = in.getPos() - origPos;
long headerSize = in.getPos() - headerStart;
return headerSize + header.getCompressed_page_size();
} finally {
if (origPos != -1) {
@@ -1,14 +1,14 @@
/*
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -38,6 +38,7 @@
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
@@ -68,6 +69,17 @@ public class TestParquetWriterAppendBlocks {
public static final SimpleGroupFactory GROUP_FACTORY =
new SimpleGroupFactory(FILE_SCHEMA);

private static final Path STATIC_FILE_1 = createPathFromCP("/test-append_1.parquet");
private static final Path STATIC_FILE_2 = createPathFromCP("/test-append_2.parquet");

private static Path createPathFromCP(String path) {
try {
return new Path(TestParquetWriterAppendBlocks.class.getResource(path).toURI());
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
}

public Path file1;
public List<Group> file1content = new ArrayList<Group>();
public Path file2;
@@ -134,6 +146,51 @@ public void testBasicBehavior() throws IOException {
Assert.assertEquals("All records should be present", 0, expected.size());
}

/**
* This test is similar to {@link #testBasicBehavior()} except that it uses static files generated by a previous release
* (1.11.1). It validates the fix of PARQUET-2027.
*/
@Test
public void testBasicBehaviorWithStaticFiles() throws IOException {
List<Group> expected = new ArrayList<>();
readAll(STATIC_FILE_1, expected);
readAll(STATIC_FILE_2, expected);

Path combinedFile = newTemp();
ParquetFileWriter writer = new ParquetFileWriter(
CONF, FILE_SCHEMA, combinedFile);
writer.start();
writer.appendFile(CONF, STATIC_FILE_1);
writer.appendFile(CONF, STATIC_FILE_2);
writer.end(EMPTY_METADATA);

try (ParquetReader<Group> reader = ParquetReader
.builder(new GroupReadSupport(), combinedFile)
.build()) {

for (Group expectedNext : expected) {
Group next = reader.read();
// check each value; equals is not supported for simple records
Assert.assertEquals("Each id should match",
expectedNext.getInteger("id", 0), next.getInteger("id", 0));
Assert.assertEquals("Each string should match",
expectedNext.getString("string", 0), next.getString("string", 0));
}
Assert.assertNull("No extra records should be present", reader.read());
}

}

private void readAll(Path file, List<Group> values) throws IOException {
try (ParquetReader<Group> reader = ParquetReader
.builder(new GroupReadSupport(), file)
.build()) {
for (Group g = reader.read(); g != null; g = reader.read()) {
values.add(g);
}
}
}

@Test
public void testMergedMetadata() throws IOException {
Path combinedFile = newTemp();
Binary file not shown.
Binary file not shown.