-
Notifications
You must be signed in to change notification settings - Fork 1.6k
PARQUET-1310: Column indexes: Filtering #509
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
465cca6
c00b2c0
3c9fe3e
c471c72
c0ce0f3
7d5faea
f9b6ecc
e82b9d5
325e5f6
659c1a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,16 +18,27 @@ | |
| */ | ||
| package org.apache.parquet.column.page; | ||
|
|
||
| import org.apache.parquet.Preconditions; | ||
|
|
||
| /** | ||
| * one data page in a chunk | ||
| */ | ||
| abstract public class DataPage extends Page { | ||
|
|
||
| private final int valueCount; | ||
| private final long firstRowIndex; | ||
|
|
||
| DataPage(int compressedSize, int uncompressedSize, int valueCount) { | ||
| super(compressedSize, uncompressedSize); | ||
| this.valueCount = valueCount; | ||
| this.firstRowIndex = -1; | ||
| } | ||
|
|
||
| DataPage(int compressedSize, int uncompressedSize, int valueCount, long firstRowIndex) { | ||
| super(compressedSize, uncompressedSize); | ||
| Preconditions.checkArgument(firstRowIndex >= 0, "First row index {} should be non-negative", firstRowIndex); | ||
| this.valueCount = valueCount; | ||
| this.firstRowIndex = firstRowIndex; | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -37,6 +48,20 @@ public int getValueCount() { | |
| return valueCount; | ||
| } | ||
|
|
||
| /** | ||
| * @return the index of the first row in this page | ||
| * @throws IllegalStateException | ||
| * if no row synchronization is required | ||
| * @see PageReadStore#isRowSynchronizationRequired() | ||
| */ | ||
| public long getFirstRowIndex() { | ||
| if (firstRowIndex < 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should there be a way to query this state without relying on an exception being thrown? |
||
| throw new IllegalStateException( | ||
| "No row synchronization is required; all pages shall be read."); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see a large amount of IllegalStateException-s scattered around the code with very similar texts about row synchronization not being required (but still being a little bit cryptic about what this means). Could you please create a separate Exception class for these so that it's not repeated all over the code with minor differences in wording? This would also allow a more verbose centralized description in the javadoc of the exception. |
||
| } | ||
| return firstRowIndex; | ||
| } | ||
|
|
||
| public abstract <T> T accept(Visitor<T> visitor); | ||
|
|
||
| public static interface Visitor<T> { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| */ | ||
| package org.apache.parquet.column.page; | ||
|
|
||
| import java.util.PrimitiveIterator; | ||
| import org.apache.parquet.column.ColumnDescriptor; | ||
|
|
||
| /** | ||
|
|
@@ -40,4 +41,26 @@ public interface PageReadStore { | |
| */ | ||
| long getRowCount(); | ||
|
|
||
| /** | ||
| * Returns the indexes of the rows to be read/built. All the rows which index is not returned shall be skipped. | ||
| * | ||
| * @return the incremental iterator of the row indexes | ||
| * @throws IllegalStateException | ||
| * if no row synchronization is required | ||
| * @see #isRowSynchronizationRequired() | ||
| */ | ||
| default PrimitiveIterator.OfLong getRowIndexes() { | ||
| throw new IllegalStateException("Row synchronization is not required; row indexes are not available"); | ||
| } | ||
|
|
||
| /** | ||
| * If row synchronization is required then some values might have to be skipped to get the rows in synch between the | ||
| * pages. | ||
| * | ||
| * @return {@code true} if row synchronization is required; {@code false} otherwise | ||
| * @see DataPage#getFirstRowIndex() | ||
| */ | ||
| default boolean isRowSynchronizationRequired() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (nit) Enabled or Active may be a better word than Required. |
||
| return false; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,12 +22,13 @@ | |
| import java.util.ArrayList; | ||
| import java.util.List; | ||
|
|
||
| import org.apache.parquet.filter2.predicate.Statistics; | ||
| import org.apache.parquet.io.api.Binary; | ||
| import org.apache.parquet.schema.PrimitiveComparator; | ||
| import org.apache.parquet.schema.PrimitiveType; | ||
|
|
||
| class BinaryColumnIndexBuilder extends ColumnIndexBuilder { | ||
| private static class BinaryColumnIndex extends ColumnIndexBase { | ||
| private static class BinaryColumnIndex extends ColumnIndexBase<Binary> { | ||
| private Binary[] minValues; | ||
| private Binary[] maxValues; | ||
|
|
||
|
|
@@ -54,6 +55,28 @@ String getMinValueAsString(int pageIndex) { | |
| String getMaxValueAsString(int pageIndex) { | ||
| return stringifier.stringify(maxValues[pageIndex]); | ||
| } | ||
|
|
||
| @Override | ||
| @SuppressWarnings("unchecked") | ||
| <T extends Comparable<T>> Statistics<T> createStats(int arrayIndex) { | ||
| return (Statistics<T>) new Statistics<Binary>(minValues[arrayIndex], maxValues[arrayIndex], comparator); | ||
| } | ||
|
|
||
| @Override | ||
| ValueComparator createValueComparator(Object value) { | ||
| final Binary v = (Binary) value; | ||
| return new ValueComparator() { | ||
| @Override | ||
| int compareValueToMin(int arrayIndex) { | ||
| return comparator.compare(v, minValues[arrayIndex]); | ||
| } | ||
|
|
||
| @Override | ||
| int compareValueToMax(int arrayIndex) { | ||
| return comparator.compare(v, maxValues[arrayIndex]); | ||
| } | ||
| }; | ||
| } | ||
| } | ||
|
|
||
| private final List<Binary> minValues = new ArrayList<>(); | ||
|
|
@@ -76,8 +99,8 @@ private static ByteBuffer convert(Binary value) { | |
|
|
||
| @Override | ||
| void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) { | ||
| minValues.add(min == null ? null : convert(min)); | ||
| maxValues.add(max == null ? null : convert(max)); | ||
| minValues.add(convert(min)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why did this check become unnecessary?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| maxValues.add(convert(max)); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -87,7 +110,7 @@ void addMinMax(Object min, Object max) { | |
| } | ||
|
|
||
| @Override | ||
| ColumnIndexBase createColumnIndex(PrimitiveType type) { | ||
| ColumnIndexBase<Binary> createColumnIndex(PrimitiveType type) { | ||
| BinaryColumnIndex columnIndex = new BinaryColumnIndex(type); | ||
| columnIndex.minValues = minValues.toArray(new Binary[minValues.size()]); | ||
| columnIndex.maxValues = maxValues.toArray(new Binary[maxValues.size()]); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(nit) I would use a different wording in the comment and in the exception as well, for example:
Could you also give a few hints about when this happens or what this means?