Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public abstract class DictionaryValuesWriter extends ValuesWriter implements Req
protected boolean dictionaryTooBig;

/* current size in bytes the dictionary will take once serialized */
protected int dictionaryByteSize;
protected long dictionaryByteSize;

/* size in bytes of the dictionary at the end of last dictionary encoded page (in case the current page falls back to PLAIN) */
protected int lastUsedDictionaryByteSize;
Expand Down Expand Up @@ -173,7 +173,7 @@ public BytesInput getBytes() {
BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes);
// remember size of dictionary when we last wrote a page
lastUsedDictionarySize = getDictionarySize();
lastUsedDictionaryByteSize = dictionaryByteSize;
lastUsedDictionaryByteSize = (int) dictionaryByteSize;
Copy link
Contributor

@gszadovszky gszadovszky May 21, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method should not be called in case of shouldFallBack() returns true but we should be on the safe side and check the potential overflow.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. Changed to Math.toIntExact so it will throw exception when overflow.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that user continues writing that large string, will here throw an exception? And it will block the writer, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. With the fix, it should've already fallback to PLAIN encoding during writeBytes before getting here.

return bytes;
} catch (IOException e) {
throw new ParquetEncodingException("could not encode the values", e);
Expand Down Expand Up @@ -249,7 +249,7 @@ public void writeBytes(Binary v) {
id = binaryDictionaryContent.size();
binaryDictionaryContent.put(v.copy(), id);
// length as int (4 bytes) + actual bytes
dictionaryByteSize += 4 + v.length();
dictionaryByteSize += 4L + v.length();
}
encodedValues.add(id);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import org.apache.parquet.column.values.plain.PlainValuesWriter;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.mockito.Mockito;

public class TestDictionary {

Expand Down Expand Up @@ -171,6 +172,20 @@ public void testBinaryDictionaryFallBack() throws IOException {
assertEquals(0, cw.getBufferedSize());
}

@Test
public void testBinaryDictionaryIntegerOverflow() {
Binary mock = Mockito.mock(Binary.class);
Mockito.when(mock.length()).thenReturn(Integer.MAX_VALUE - 1);
// make the writer happy
Mockito.when(mock.copy()).thenReturn(Binary.fromString(" world"));

final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(100, 100);
cw.writeBytes(Binary.fromString("hello"));
cw.writeBytes(mock);

assertEquals(PLAIN, cw.getEncoding());
}

@Test
public void testBinaryDictionaryChangedValues() throws IOException {
int COUNT = 100;
Expand Down