diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java index 2999f3cdc4..c4a985224f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java @@ -81,7 +81,7 @@ public abstract class DictionaryValuesWriter extends ValuesWriter implements Req protected boolean dictionaryTooBig; /* current size in bytes the dictionary will take once serialized */ - protected int dictionaryByteSize; + protected long dictionaryByteSize; /* size in bytes of the dictionary at the end of last dictionary encoded page (in case the current page falls back to PLAIN) */ protected int lastUsedDictionaryByteSize; @@ -173,7 +173,7 @@ public BytesInput getBytes() { BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes); // remember size of dictionary when we last wrote a page lastUsedDictionarySize = getDictionarySize(); - lastUsedDictionaryByteSize = dictionaryByteSize; + lastUsedDictionaryByteSize = Math.toIntExact(dictionaryByteSize); return bytes; } catch (IOException e) { throw new ParquetEncodingException("could not encode the values", e); @@ -249,7 +249,7 @@ public void writeBytes(Binary v) { id = binaryDictionaryContent.size(); binaryDictionaryContent.put(v.copy(), id); // length as int (4 bytes) + actual bytes - dictionaryByteSize += 4 + v.length(); + dictionaryByteSize += 4L + v.length(); } encodedValues.add(id); } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java index 2783b696d5..174fad8918 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java @@ -53,6 +53,7 @@ import org.apache.parquet.column.values.plain.PlainValuesWriter; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.mockito.Mockito; public class TestDictionary { @@ -171,6 +172,20 @@ public void testBinaryDictionaryFallBack() throws IOException { assertEquals(0, cw.getBufferedSize()); } + @Test + public void testBinaryDictionaryIntegerOverflow() { + Binary mock = Mockito.mock(Binary.class); + Mockito.when(mock.length()).thenReturn(Integer.MAX_VALUE - 1); + // make the writer happy + Mockito.when(mock.copy()).thenReturn(Binary.fromString(" world")); + + final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(100, 100); + cw.writeBytes(Binary.fromString("hello")); + cw.writeBytes(mock); + + assertEquals(PLAIN, cw.getEncoding()); + } + @Test public void testBinaryDictionaryChangedValues() throws IOException { int COUNT = 100; diff --git a/pom.xml b/pom.xml index 5d37dc0679..ec4d56621c 100644 --- a/pom.xml +++ b/pom.xml @@ -501,6 +501,10 @@ ${shade.prefix} + + org.apache.parquet.column.values.dictionary.DictionaryValuesWriter#dictionaryByteSize