PARQUET-1328: Add Bloom filter reader and writer #521
ParquetProperties.java
@@ -18,21 +18,23 @@
 */
package org.apache.parquet.column;

import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt;

import java.util.HashMap;

import org.apache.parquet.Preconditions;
import org.apache.parquet.bytes.ByteBufferAllocator;
import org.apache.parquet.bytes.CapacityByteArrayOutputStream;
import org.apache.parquet.bytes.HeapByteBufferAllocator;

import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt;
import org.apache.parquet.column.impl.ColumnWriteStoreV1;
import org.apache.parquet.column.impl.ColumnWriteStoreV2;
import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter;
import org.apache.parquet.column.values.factory.DefaultValuesWriterFactory;
import org.apache.parquet.column.values.factory.ValuesWriterFactory;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter;
import org.apache.parquet.column.values.factory.ValuesWriterFactory;
import org.apache.parquet.schema.MessageType;

/**

@@ -47,6 +49,7 @@ public class ParquetProperties {
  public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true;
  public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100;
  public static final int DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK = 10000;
  public static final boolean DEFAULT_BLOOM_FILTER_ENABLED = false;

  public static final ValuesWriterFactory DEFAULT_VALUES_WRITER_FACTORY = new DefaultValuesWriterFactory();

@@ -83,10 +86,12 @@ public static WriterVersion fromString(String name) {
  private final boolean estimateNextSizeCheck;
  private final ByteBufferAllocator allocator;
  private final ValuesWriterFactory valuesWriterFactory;
  private final boolean enableBloomFilter;
  private final HashMap<String, Long> bloomFilterInfo;

  private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPageSize, boolean enableDict, int minRowCountForPageSizeCheck,
                            int maxRowCountForPageSizeCheck, boolean estimateNextSizeCheck, ByteBufferAllocator allocator,
                            ValuesWriterFactory writerFactory) {
                            ValuesWriterFactory writerFactory, boolean enableBloomFilter, HashMap<String, Long> bloomFilterInfo) {
    this.pageSizeThreshold = pageSize;
    this.initialSlabSize = CapacityByteArrayOutputStream
        .initialSlabSizeHeuristic(MIN_SLAB_SIZE, pageSizeThreshold, 10);

@@ -97,7 +102,8 @@ private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPag
    this.maxRowCountForPageSizeCheck = maxRowCountForPageSizeCheck;
    this.estimateNextSizeCheck = estimateNextSizeCheck;
    this.allocator = allocator;

    this.enableBloomFilter = enableBloomFilter;
    this.bloomFilterInfo = bloomFilterInfo;
    this.valuesWriterFactory = writerFactory;
  }

@@ -159,6 +165,14 @@ public ByteBufferAllocator getAllocator() {
    return allocator;
  }

  public boolean isBloomFilterEnabled() {
    return enableBloomFilter;
  }

  public HashMap<String, Long> getBloomFilterInfo() {
    return bloomFilterInfo;
  }

  public ColumnWriteStore newColumnWriteStore(MessageType schema,
                                              PageWriteStore pageStore) {
    switch (writerVersion) {

@@ -199,6 +213,8 @@ public static class Builder {
    private int pageSize = DEFAULT_PAGE_SIZE;
    private int dictPageSize = DEFAULT_DICTIONARY_PAGE_SIZE;
    private boolean enableDict = DEFAULT_IS_DICTIONARY_ENABLED;
    private boolean enableBloomFilter = DEFAULT_BLOOM_FILTER_ENABLED;
    private HashMap<String, Long> bloomFilterInfo = new HashMap<>();
    private WriterVersion writerVersion = DEFAULT_WRITER_VERSION;
    private int minRowCountForPageSizeCheck = DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK;
    private int maxRowCountForPageSizeCheck = DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK;

@@ -217,6 +233,8 @@ private Builder(ParquetProperties toCopy) {
      this.maxRowCountForPageSizeCheck = toCopy.maxRowCountForPageSizeCheck;
      this.estimateNextSizeCheck = toCopy.estimateNextSizeCheck;
      this.allocator = toCopy.allocator;
      this.enableBloomFilter = toCopy.enableBloomFilter;
      this.bloomFilterInfo = toCopy.bloomFilterInfo;
    }

    /**

@@ -256,6 +274,38 @@ public Builder withDictionaryPageSize(int dictionaryPageSize) {
      return this;
    }

    /**
     * Enable or disable writing Bloom filters.
     *
     * @param enableBloomFilter whether Bloom filters should be written.
     * @return this builder for method chaining.
     */
    public Builder withBloomFilterEnabled(boolean enableBloomFilter) {
      this.enableBloomFilter = enableBloomFilter;
      return this;
    }

    /**
     * Set per-column Bloom filter info.
     *
     * @param names comma-separated names of the columns for which Bloom filters are enabled
     * @param sizes comma-separated Bloom filter sizes, one per column in {@code names}
     * @return this builder for method chaining
     */
    public Builder withBloomFilterInfo(String names, String sizes) {
      String[] bloomFilterColumns = names.split(",");
      String[] bloomFilterSizes = sizes.split(",");

      Preconditions.checkArgument(bloomFilterColumns.length == bloomFilterSizes.length,
          "The number of column names does not match the number of sizes");

      for (int i = 0; i < bloomFilterColumns.length; i++) {
        bloomFilterInfo.put(bloomFilterColumns[i], Long.parseLong(bloomFilterSizes[i]));
      }

      return this;
    }

    /**
     * Set the {@link WriterVersion format version}.
     *

@@ -303,7 +353,8 @@ public ParquetProperties build() {
      ParquetProperties properties =
          new ParquetProperties(writerVersion, pageSize, dictPageSize,
              enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck,
              estimateNextSizeCheck, allocator, valuesWriterFactory);
              estimateNextSizeCheck, allocator, valuesWriterFactory,
              enableBloomFilter, bloomFilterInfo);
      // we pass a constructed but uninitialized factory to ParquetProperties above as currently
      // creation of ValuesWriters is invoked from within ParquetProperties. In the future
      // we'd like to decouple that and won't need to pass an object to properties and then pass the

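For context, a minimal usage sketch of the new builder options. This is not part of the patch; the column names and size values below are invented, and the size is handed straight to the BloomFilter constructor, so its unit is whatever that constructor defines.

import org.apache.parquet.column.ParquetProperties;

public class BloomFilterPropertiesExample {
  public static void main(String[] args) {
    // Hypothetical columns "id" and "name"; withBloomFilterInfo expects two
    // comma-separated lists of equal length.
    ParquetProperties props = ParquetProperties.builder()
        .withBloomFilterEnabled(true)
        .withBloomFilterInfo("id,name", "1048576,524288")
        .build();

    System.out.println(props.isBloomFilterEnabled());           // true
    System.out.println(props.getBloomFilterInfo().get("name")); // 524288
  }
}

Passing the per-column configuration as two parallel comma-separated strings keeps the builder surface small, at the cost of deferring validation to the Preconditions check inside withBloomFilterInfo.
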
ColumnWriteStoreV2.java
@@ -35,6 +35,8 @@
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.page.PageWriter;
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
import org.apache.parquet.schema.MessageType;

public class ColumnWriteStoreV2 implements ColumnWriteStore {

@@ -66,6 +68,30 @@ public ColumnWriteStoreV2(
    this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();
  }

  public ColumnWriteStoreV2(
      MessageType schema,
      PageWriteStore pageWriteStore,
      BloomFilterWriteStore bloomFilterWriteStore,
      ParquetProperties props) {
    this.props = props;
    this.thresholdTolerance = (long)(props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
    Map<ColumnDescriptor, ColumnWriterV2> mcolumns = new TreeMap<ColumnDescriptor, ColumnWriterV2>();

    for (ColumnDescriptor path : schema.getColumns()) {
      PageWriter pageWriter = pageWriteStore.getPageWriter(path);
      if (props.isBloomFilterEnabled() && props.getBloomFilterInfo() != null) {
        BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path);
        mcolumns.put(path, new ColumnWriterV2(path, pageWriter, bloomFilterWriter, props));
      } else {
        mcolumns.put(path, new ColumnWriterV2(path, pageWriter, props));
      }
    }
    this.columns = unmodifiableMap(mcolumns);
    this.writers = this.columns.values();

    this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();
  }

  public ColumnWriter getColumnWriter(ColumnDescriptor path) {
    return columns.get(path);
  }

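The new constructor asks the BloomFilterWriteStore for a per-column BloomFilterWriter, but only when Bloom filters are enabled and configured. Both interfaces are defined elsewhere in this PR; assuming getBloomFilterWriter(ColumnDescriptor) is the store's only method and writeBloomFilter(BloomFilter) is the writer's only method, as the call sites in this diff suggest, a throwaway in-memory store for a unit test might look roughly like this:

import java.util.HashMap;
import java.util.Map;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;

// Illustrative only: remembers the Bloom filter each column writer flushes,
// keyed by column descriptor, so a test can inspect it afterwards.
public class InMemoryBloomFilterWriteStore implements BloomFilterWriteStore {
  private final Map<ColumnDescriptor, BloomFilter> written = new HashMap<>();

  @Override
  public BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path) {
    return bloomFilter -> written.put(path, bloomFilter);
  }

  public BloomFilter getWrittenBloomFilter(ColumnDescriptor path) {
    return written.get(path);
  }
}

Such a store would be handed to the new ColumnWriteStoreV2 constructor together with the usual PageWriteStore.
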
ColumnWriterV1.java
@@ -21,6 +21,7 @@
import static org.apache.parquet.bytes.BytesInput.concat;

import java.io.IOException;
import java.util.HashMap;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnWriter;

@@ -29,6 +30,8 @@
import org.apache.parquet.column.page.PageWriter;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.api.Binary;
import org.slf4j.Logger;

@@ -55,6 +58,23 @@ final class ColumnWriterV1 implements ColumnWriter {
  private int valueCountForNextSizeCheck;

  private Statistics statistics;
  private BloomFilterWriter bloomFilterWriter;
  private BloomFilter bloomFilter;

  public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter,
                        BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
    this(path, pageWriter, props);

    // Nested columns are not currently supported.
    if (path.getPath().length == 1) {
      this.bloomFilterWriter = bloomFilterWriter;
      HashMap<String, Long> bloomFilterInfo = props.getBloomFilterInfo();
      String column = path.getPath()[0];
      if (bloomFilterInfo.containsKey(column)) {
        this.bloomFilter = new BloomFilter(bloomFilterInfo.get(column).intValue());
      }
    }
  }

  public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter,
                        ParquetProperties props) {

@@ -177,6 +197,9 @@ public void write(double value, int repetitionLevel, int definitionLevel) {
    definitionLevelColumn.writeInteger(definitionLevel);
    dataColumn.writeDouble(value);
    updateStatistics(value);
    if (bloomFilter != null) {
      bloomFilter.insert(bloomFilter.hash(value));
    }
    accountForValueWritten();
  }

@@ -187,6 +210,9 @@ public void write(float value, int repetitionLevel, int definitionLevel) {
    definitionLevelColumn.writeInteger(definitionLevel);
    dataColumn.writeFloat(value);
    updateStatistics(value);
    if (bloomFilter != null) {
      bloomFilter.insert(bloomFilter.hash(value));
    }
    accountForValueWritten();
  }

@@ -197,6 +223,9 @@ public void write(Binary value, int repetitionLevel, int definitionLevel) {
    definitionLevelColumn.writeInteger(definitionLevel);
    dataColumn.writeBytes(value);
    updateStatistics(value);
    if (bloomFilter != null) {
      bloomFilter.insert(bloomFilter.hash(value));
    }
    accountForValueWritten();
  }

@@ -217,6 +246,9 @@ public void write(int value, int repetitionLevel, int definitionLevel) {
    definitionLevelColumn.writeInteger(definitionLevel);
    dataColumn.writeInteger(value);
    updateStatistics(value);
    if (bloomFilter != null) {
      bloomFilter.insert(bloomFilter.hash(value));
    }
    accountForValueWritten();
  }

@@ -227,6 +259,9 @@ public void write(long value, int repetitionLevel, int definitionLevel) {
    definitionLevelColumn.writeInteger(definitionLevel);
    dataColumn.writeLong(value);
    updateStatistics(value);
    if (bloomFilter != null) {
      bloomFilter.insert(bloomFilter.hash(value));
    }
    accountForValueWritten();
  }

@@ -244,6 +279,10 @@ public void flush() {
      }
      dataColumn.resetDictionary();
    }

    if (bloomFilterWriter != null && bloomFilter != null) {
      bloomFilterWriter.writeBloomFilter(bloomFilter);
    }
  }

  @Override

@@ -257,17 +296,21 @@ public void close() {

  @Override
  public long getBufferedSizeInMemory() {
    long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize();
    return repetitionLevelColumn.getBufferedSize()
        + definitionLevelColumn.getBufferedSize()
        + dataColumn.getBufferedSize()
        + pageWriter.getMemSize();
        + pageWriter.getMemSize()
        + bloomBufferSize;
  }

  public long allocatedSize() {
    long bloomAllocatedSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize();
    return repetitionLevelColumn.getAllocatedSize()
        + definitionLevelColumn.getAllocatedSize()
        + dataColumn.getAllocatedSize()
        + pageWriter.allocatedSize();
        + definitionLevelColumn.getAllocatedSize()
        + dataColumn.getAllocatedSize()
        + pageWriter.allocatedSize()
        + bloomAllocatedSize;
  }

  public String memUsageString(String indent) {

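Taken together: each write(...) overload hashes the value and inserts the hash into the column's Bloom filter, flush() hands the finished filter to the BloomFilterWriter, and the two size methods fold the filter's buffered size into the writer's memory accounting. Below is a small sketch that exercises the same BloomFilter calls in isolation; the size 1024 is an arbitrary stand-in for a per-column value from getBloomFilterInfo().

import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.io.api.Binary;

public class BloomFilterInsertSketch {
  public static void main(String[] args) {
    BloomFilter bloomFilter = new BloomFilter(1024);

    // Same hash-then-insert pattern as the write(...) overloads above.
    bloomFilter.insert(bloomFilter.hash(42));
    bloomFilter.insert(bloomFilter.hash(42L));
    bloomFilter.insert(bloomFilter.hash(2.5d));
    bloomFilter.insert(bloomFilter.hash(Binary.fromString("parquet")));

    // This is what feeds getBufferedSizeInMemory() and allocatedSize() above.
    System.out.println(bloomFilter.getBufferedSize());
  }
}
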
Review comment: Please keep changes like this in a different patch. Every patch should have a single purpose.