diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java
index fa25943b82..2edc585c4b 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java
@@ -55,7 +55,7 @@ public static Offsets getOffsets(SeekableInputStream input, ColumnChunkMetaData
          *   (0 cannot be a valid offset because of the MAGIC bytes)
          * - The firstDataPageOffset might point to the dictionary page
          */
-        dictionaryPageSize = readDictionaryPageSize(input, newChunkStart);
+        dictionaryPageSize = readDictionaryPageSize(input, chunk);
       } else {
         dictionaryPageSize = chunk.getFirstDataPageOffset() - chunk.getDictionaryPageOffset();
       }
@@ -68,12 +68,14 @@
     return new Offsets(firstDataPageOffset, dictionaryPageOffset);
   }
 
-  private static long readDictionaryPageSize(SeekableInputStream in, long pos) throws IOException {
+  private static long readDictionaryPageSize(SeekableInputStream in, ColumnChunkMetaData chunk) throws IOException {
     long origPos = -1;
     try {
       origPos = in.getPos();
+      in.seek(chunk.getStartingPos());
+      long headerStart = in.getPos();
       PageHeader header = Util.readPageHeader(in);
-      long headerSize = in.getPos() - origPos;
+      long headerSize = in.getPos() - headerStart;
       return headerSize + header.getCompressed_page_size();
     } finally {
       if (origPos != -1) {
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterAppendBlocks.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterAppendBlocks.java
index bda5333523..82d48f411d 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterAppendBlocks.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterAppendBlocks.java
@@ -1,4 +1,4 @@
-/* 
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -38,6 +38,7 @@ import org.junit.rules.TemporaryFolder;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.LinkedList;
@@ -68,6 +69,17 @@ public class TestParquetWriterAppendBlocks {
   public static final SimpleGroupFactory GROUP_FACTORY =
       new SimpleGroupFactory(FILE_SCHEMA);
 
+  private static final Path STATIC_FILE_1 = createPathFromCP("/test-append_1.parquet");
+  private static final Path STATIC_FILE_2 = createPathFromCP("/test-append_2.parquet");
+
+  private static Path createPathFromCP(String path) {
+    try {
+      return new Path(TestParquetWriterAppendBlocks.class.getResource(path).toURI());
+    } catch (URISyntaxException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
   public Path file1;
   public List<Group> file1content = new ArrayList<Group>();
   public Path file2;
@@ -134,6 +146,51 @@ public void testBasicBehavior() throws IOException {
     Assert.assertEquals("All records should be present", 0, expected.size());
   }
 
+  /**
+   * This test is similar to {@link #testBasicBehavior()} only that it uses static files generated by a previous release
+   * (1.11.1). This test is to validate the fix of PARQUET-2027.
+   */
+  @Test
+  public void testBasicBehaviorWithStaticFiles() throws IOException {
+    List<Group> expected = new ArrayList<>();
+    readAll(STATIC_FILE_1, expected);
+    readAll(STATIC_FILE_2, expected);
+
+    Path combinedFile = newTemp();
+    ParquetFileWriter writer = new ParquetFileWriter(
+        CONF, FILE_SCHEMA, combinedFile);
+    writer.start();
+    writer.appendFile(CONF, STATIC_FILE_1);
+    writer.appendFile(CONF, STATIC_FILE_2);
+    writer.end(EMPTY_METADATA);
+
+    try (ParquetReader<Group> reader = ParquetReader
+        .builder(new GroupReadSupport(), combinedFile)
+        .build()) {
+
+      for (Group expectedNext : expected) {
+        Group next = reader.read();
+        // check each value; equals is not supported for simple records
+        Assert.assertEquals("Each id should match",
+            expectedNext.getInteger("id", 0), next.getInteger("id", 0));
+        Assert.assertEquals("Each string should match",
+            expectedNext.getString("string", 0), next.getString("string", 0));
+      }
+      Assert.assertNull("No extra records should be present", reader.read());
+    }
+
+  }
+
+  private void readAll(Path file, List<Group> values) throws IOException {
+    try (ParquetReader<Group> reader = ParquetReader
+        .builder(new GroupReadSupport(), file)
+        .build()) {
+      for (Group g = reader.read(); g != null; g = reader.read()) {
+        values.add(g);
+      }
+    }
+  }
+
   @Test
   public void testMergedMetadata() throws IOException {
     Path combinedFile = newTemp();
diff --git a/parquet-hadoop/src/test/resources/test-append_1.parquet b/parquet-hadoop/src/test/resources/test-append_1.parquet
new file mode 100644
index 0000000000..a255f86ebc
Binary files /dev/null and b/parquet-hadoop/src/test/resources/test-append_1.parquet differ
diff --git a/parquet-hadoop/src/test/resources/test-append_2.parquet b/parquet-hadoop/src/test/resources/test-append_2.parquet
new file mode 100644
index 0000000000..3081f893f5
Binary files /dev/null and b/parquet-hadoop/src/test/resources/test-append_2.parquet differ