diff --git a/presto-delta/src/main/java/com/facebook/presto/delta/DeltaPageSourceProvider.java b/presto-delta/src/main/java/com/facebook/presto/delta/DeltaPageSourceProvider.java index 91b90f508b402..b5105a1e802af 100644 --- a/presto-delta/src/main/java/com/facebook/presto/delta/DeltaPageSourceProvider.java +++ b/presto-delta/src/main/java/com/facebook/presto/delta/DeltaPageSourceProvider.java @@ -33,6 +33,7 @@ import com.facebook.presto.parquet.RichColumnDescriptor; import com.facebook.presto.parquet.cache.MetadataReader; import com.facebook.presto.parquet.predicate.Predicate; +import com.facebook.presto.parquet.reader.ColumnIndexFilterUtils; import com.facebook.presto.parquet.reader.ParquetReader; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ConnectorPageSource; @@ -55,6 +56,7 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.io.ColumnIO; import org.apache.parquet.io.MessageColumnIO; import org.apache.parquet.schema.GroupType; @@ -64,6 +66,7 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Optional; @@ -164,7 +167,8 @@ public ConnectorPageSource createPageSource( isParquetBatchReaderVerificationEnabled(session), typeManager, deltaTableLayoutHandle.getPredicate(), - fileFormatDataSourceStats); + fileFormatDataSourceStats, + false); return new DeltaPageSource( deltaColumnHandles, @@ -207,7 +211,8 @@ private static ConnectorPageSource createParquetPageSource( boolean verificationEnabled, TypeManager typeManager, TupleDomain effectivePredicate, - FileFormatDataSourceStats stats) + FileFormatDataSourceStats stats, + boolean columnIndexFilterEnabled) { AggregatedMemoryContext systemMemoryContext = 
newSimpleAggregatedMemoryContext(); @@ -243,9 +248,12 @@ private static ConnectorPageSource createParquetPageSource( Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath); final ParquetDataSource finalDataSource = dataSource; ImmutableList.Builder blocks = ImmutableList.builder(); + List blockIndexStores = new ArrayList<>(); for (BlockMetaData block : footerBlocks.build()) { - if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain)) { + Optional columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled); + if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) { blocks.add(block); + blockIndexStores.add(columnIndexStore.orElse(null)); } } MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema); @@ -256,7 +264,10 @@ private static ConnectorPageSource createParquetPageSource( systemMemoryContext, maxReadBlockSize, batchReaderEnabled, - verificationEnabled); + verificationEnabled, + parquetPredicate, + blockIndexStores, + columnIndexFilterEnabled); ImmutableList.Builder namesBuilder = ImmutableList.builder(); ImmutableList.Builder typesBuilder = ImmutableList.builder(); diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java index ede8b0e5fe3f0..9747543166992 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java @@ -205,6 +205,8 @@ public class HiveClientConfig private boolean userDefinedTypeEncodingEnabled; + private boolean columnIndexFilterEnabled; + @Min(0) public int getMaxInitialSplits() { @@ -1704,6 +1706,19 @@ public int 
getMaterializedViewMissingPartitionsThreshold() return this.materializedViewMissingPartitionsThreshold; } + @Config("hive.parquet-column-index-filter-enabled") + @ConfigDescription("enable using parquet column index filter") + public HiveClientConfig setReadColumnIndexFilter(boolean columnIndexFilterEnabled) + { + this.columnIndexFilterEnabled = columnIndexFilterEnabled; + return this; + } + + public boolean getReadColumnIndexFilter() + { + return this.columnIndexFilterEnabled; + } + @Config("hive.size-based-split-weights-enabled") public HiveClientConfig setSizeBasedSplitWeightsEnabled(boolean sizeBasedSplitWeightsEnabled) { diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java index e51edc9f15042..8e801035051c4 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java @@ -130,6 +130,7 @@ public final class HiveSessionProperties public static final String VERBOSE_RUNTIME_STATS_ENABLED = "verbose_runtime_stats_enabled"; private static final String DWRF_WRITER_STRIPE_CACHE_ENABLED = "dwrf_writer_stripe_cache_enabled"; private static final String DWRF_WRITER_STRIPE_CACHE_SIZE = "dwrf_writer_stripe_cache_size"; + public static final String USE_COLUMN_INDEX_FILTER = "use_column_index_filter"; public static final String SIZE_BASED_SPLIT_WEIGHTS_ENABLED = "size_based_split_weights_enabled"; public static final String MINIMUM_ASSIGNED_SPLIT_WEIGHT = "minimum_assigned_split_weight"; private static final String USE_RECORD_PAGE_SOURCE_FOR_CUSTOM_SPLIT = "use_record_page_source_for_custom_split"; @@ -628,6 +629,11 @@ public HiveSessionProperties(HiveClientConfig hiveClientConfig, OrcFileWriterCon "Maximum size of DWRF stripe cache to be held in memory", orcFileWriterConfig.getDwrfStripeCacheMaxSize(), false), + booleanProperty( + 
USE_COLUMN_INDEX_FILTER, + "should use column index statistics filtering", + hiveClientConfig.getReadColumnIndexFilter(), + false), booleanProperty( SIZE_BASED_SPLIT_WEIGHTS_ENABLED, "Enable estimating split weights based on size in bytes", @@ -1118,6 +1124,11 @@ public static DataSize getDwrfWriterStripeCacheeMaxSize(ConnectorSession session return session.getProperty(DWRF_WRITER_STRIPE_CACHE_SIZE, DataSize.class); } + public static boolean columnIndexFilterEnabled(ConnectorSession session) + { + return session.getProperty(USE_COLUMN_INDEX_FILTER, Boolean.class); + } + public static boolean isSizeBasedSplitWeightsEnabled(ConnectorSession session) { return session.getProperty(SIZE_BASED_SPLIT_WEIGHTS_ENABLED, Boolean.class); diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/parquet/HdfsParquetDataSource.java b/presto-hive/src/main/java/com/facebook/presto/hive/parquet/HdfsParquetDataSource.java index f3b83760e1c89..642f1abc117aa 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/parquet/HdfsParquetDataSource.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/parquet/HdfsParquetDataSource.java @@ -20,9 +20,16 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.parquet.format.Util; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.hadoop.metadata.IndexReference; import java.io.FileNotFoundException; import java.io.IOException; +import java.util.Optional; import static com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; import static com.facebook.presto.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; @@ -67,6 +74,30 @@ protected void readInternal(long position, byte[] buffer, int 
bufferOffset, int } } + @Override + public Optional readColumnIndex(ColumnChunkMetaData column) + throws IOException + { + IndexReference indexRef = column.getColumnIndexReference(); + if (indexRef == null) { + return Optional.empty(); + } + inputStream.seek(indexRef.getOffset()); + return Optional.of(ParquetMetadataConverter.fromParquetColumnIndex(column.getPrimitiveType(), Util.readColumnIndex(inputStream))); + } + + @Override + public Optional readOffsetIndex(ColumnChunkMetaData column) + throws IOException + { + IndexReference indexRef = column.getOffsetIndexReference(); + if (indexRef == null) { + return Optional.empty(); + } + inputStream.seek(indexRef.getOffset()); + return Optional.of(ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(inputStream))); + } + public static HdfsParquetDataSource buildHdfsParquetDataSource(FileSystem fileSystem, Path path, long start, long length, FileFormatDataSourceStats stats) { try { diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetPageSourceFactory.java b/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetPageSourceFactory.java index fb1b4495f5379..00fe94538ded6 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetPageSourceFactory.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetPageSourceFactory.java @@ -34,6 +34,7 @@ import com.facebook.presto.parquet.RichColumnDescriptor; import com.facebook.presto.parquet.cache.ParquetMetadataSource; import com.facebook.presto.parquet.predicate.Predicate; +import com.facebook.presto.parquet.reader.ColumnIndexFilterUtils; import com.facebook.presto.parquet.reader.ParquetReader; import com.facebook.presto.spi.ConnectorPageSource; import com.facebook.presto.spi.ConnectorSession; @@ -52,6 +53,7 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import 
org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.io.ColumnIO; import org.apache.parquet.io.MessageColumnIO; import org.apache.parquet.schema.GroupType; @@ -63,6 +65,7 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Map; @@ -94,6 +97,7 @@ import static com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; import static com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA; import static com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH; +import static com.facebook.presto.hive.HiveSessionProperties.columnIndexFilterEnabled; import static com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize; import static com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled; import static com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled; @@ -186,7 +190,8 @@ public Optional createPageSource( effectivePredicate, stats, hiveFileContext, - parquetMetadataSource)); + parquetMetadataSource, + columnIndexFilterEnabled(session))); } public static ConnectorPageSource createParquetPageSource( @@ -208,7 +213,8 @@ public static ConnectorPageSource createParquetPageSource( TupleDomain effectivePredicate, FileFormatDataSourceStats stats, HiveFileContext hiveFileContext, - ParquetMetadataSource parquetMetadataSource) + ParquetMetadataSource parquetMetadataSource, + boolean columnIndexFilterEnabled) { AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext(); @@ -248,9 +254,12 @@ public static ConnectorPageSource createParquetPageSource( Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath); final ParquetDataSource finalDataSource = dataSource; ImmutableList.Builder blocks = ImmutableList.builder(); + List blockIndexStores = new ArrayList<>(); for (BlockMetaData block 
: footerBlocks.build()) { - if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain)) { + Optional columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled); + if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) { blocks.add(block); + blockIndexStores.add(columnIndexStore.orElse(null)); hiveFileContext.incrementCounter("parquet.blocksRead", 1); hiveFileContext.incrementCounter("parquet.rowsRead", block.getRowCount()); hiveFileContext.incrementCounter("parquet.totalBytesRead", block.getTotalByteSize()); @@ -269,7 +278,10 @@ public static ConnectorPageSource createParquetPageSource( systemMemoryContext, maxReadBlockSize, batchReaderEnabled, - verificationEnabled); + verificationEnabled, + parquetPredicate, + blockIndexStores, + columnIndexFilterEnabled); ImmutableList.Builder namesBuilder = ImmutableList.builder(); ImmutableList.Builder typesBuilder = ImmutableList.builder(); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java index e3dec3b5fa76b..a33ed612c67d6 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java @@ -157,6 +157,7 @@ public void testDefaults() .setPartitionLeaseDuration(new Duration(0, TimeUnit.SECONDS)) .setMaterializedViewMissingPartitionsThreshold(100) .setLooseMemoryAccountingEnabled(false) + .setReadColumnIndexFilter(false) .setSizeBasedSplitWeightsEnabled(true) .setMinimumAssignedSplitWeight(0.05) .setUserDefinedTypeEncodingEnabled(false) @@ -278,6 +279,7 @@ public void testExplicitPropertyMappings() .put("hive.loose-memory-accounting-enabled", "true") 
.put("hive.verbose-runtime-stats-enabled", "true") .put("hive.materialized-view-missing-partitions-threshold", "50") + .put("hive.parquet-column-index-filter-enabled", "true") .put("hive.size-based-split-weights-enabled", "false") .put("hive.user-defined-type-encoding-enabled", "true") .put("hive.minimum-assigned-split-weight", "1.0") @@ -396,6 +398,7 @@ public void testExplicitPropertyMappings() .setPartitionLeaseDuration(new Duration(4, TimeUnit.HOURS)) .setMaterializedViewMissingPartitionsThreshold(50) .setLooseMemoryAccountingEnabled(true) + .setReadColumnIndexFilter(true) .setSizeBasedSplitWeightsEnabled(false) .setMinimumAssignedSplitWeight(1.0) .setUserDefinedTypeEncodingEnabled(true) diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/BenchmarkParquetPageSource.java b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/BenchmarkParquetPageSource.java index 5680c65d299ef..bbff04916d096 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/BenchmarkParquetPageSource.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/BenchmarkParquetPageSource.java @@ -290,8 +290,7 @@ ParquetPageSource createParquetPageSource() fields.add(ColumnIOConverter.constructField(getTypeFromTypeSignature(), messageColumnIO.getChild(i))); } - ParquetReader parquetReader = new ParquetReader(messageColumnIO, parquetMetadata.getBlocks(), dataSource, newSimpleAggregatedMemoryContext(), new DataSize(16, MEGABYTE), batchReadEnabled, enableVerification); - + ParquetReader parquetReader = new ParquetReader(messageColumnIO, parquetMetadata.getBlocks(), dataSource, newSimpleAggregatedMemoryContext(), new DataSize(16, MEGABYTE), batchReadEnabled, enableVerification, null, null, false); return new ParquetPageSource(parquetReader, Collections.nCopies(channelCount, type), fields, columnNames, new RuntimeStats()); } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergPageSourceProvider.java 
b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergPageSourceProvider.java index aab7c411753ba..a06580012cee0 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergPageSourceProvider.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergPageSourceProvider.java @@ -54,6 +54,7 @@ import com.facebook.presto.parquet.RichColumnDescriptor; import com.facebook.presto.parquet.cache.MetadataReader; import com.facebook.presto.parquet.predicate.Predicate; +import com.facebook.presto.parquet.reader.ColumnIndexFilterUtils; import com.facebook.presto.parquet.reader.ParquetReader; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ConnectorPageSource; @@ -78,6 +79,7 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.io.MessageColumnIO; import org.apache.parquet.schema.MessageType; @@ -231,7 +233,8 @@ private ConnectorPageSource createDataPageSource( isParquetBatchReadsEnabled(session), isParquetBatchReaderVerificationEnabled(session), predicate, - fileFormatDataSourceStats); + fileFormatDataSourceStats, + false); case ORC: OrcReaderOptions readerOptions = new OrcReaderOptions( getOrcMaxMergeDistance(session), @@ -281,7 +284,8 @@ private static ConnectorPageSource createParquetPageSource( boolean batchReaderEnabled, boolean verificationEnabled, TupleDomain effectivePredicate, - FileFormatDataSourceStats fileFormatDataSourceStats) + FileFormatDataSourceStats fileFormatDataSourceStats, + boolean columnIndexFilterEnabled) { AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext(); @@ -319,13 +323,16 @@ private static ConnectorPageSource createParquetPageSource( Map, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema); 
TupleDomain parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate); Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath); - + final ParquetDataSource finalDataSource = dataSource; List blocks = new ArrayList<>(); + List blockIndexStores = new ArrayList<>(); for (BlockMetaData block : parquetMetadata.getBlocks()) { long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); + Optional columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled); if ((firstDataPage >= start) && (firstDataPage < (start + length)) && - predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain)) { + predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) { blocks.add(block); + blockIndexStores.add(columnIndexStore.orElse(null)); } } @@ -337,7 +344,10 @@ private static ConnectorPageSource createParquetPageSource( systemMemoryContext, maxReadBlockSize, batchReaderEnabled, - verificationEnabled); + verificationEnabled, + parquetPredicate, + blockIndexStores, + columnIndexFilterEnabled); ImmutableList.Builder namesBuilder = ImmutableList.builder(); ImmutableList.Builder prestoTypes = ImmutableList.builder(); diff --git a/presto-parquet/pom.xml b/presto-parquet/pom.xml index c6105bcd7a9c7..713f42c1cf28c 100644 --- a/presto-parquet/pom.xml +++ b/presto-parquet/pom.xml @@ -187,6 +187,13 @@ jmh-generator-annprocess test + + + junit + junit + 4.13.1 + test + diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/ColumnReader.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/ColumnReader.java index 43238a1b2cccb..dfde65e1ddd94 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/ColumnReader.java +++ 
b/presto-parquet/src/main/java/com/facebook/presto/parquet/ColumnReader.java @@ -15,12 +15,13 @@ import com.facebook.presto.parquet.reader.ColumnChunk; import com.facebook.presto.parquet.reader.PageReader; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; public interface ColumnReader { boolean isInitialized(); - void init(PageReader pageReader, Field field); + void init(PageReader pageReader, Field field, RowRanges rowRanges); void prepareNextRead(int batchSize); diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPage.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPage.java index 065684e6f91fb..27667ed7ec1ec 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPage.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPage.java @@ -13,15 +13,33 @@ */ package com.facebook.presto.parquet; +import java.util.Optional; + public abstract class DataPage extends Page { protected final int valueCount; + private final long firstRowIndex; public DataPage(int compressedSize, int uncompressedSize, int valueCount) + { + this(compressedSize, uncompressedSize, valueCount, -1); + } + + DataPage(int compressedSize, int uncompressedSize, int valueCount, long firstRowIndex) { super(compressedSize, uncompressedSize); this.valueCount = valueCount; + this.firstRowIndex = firstRowIndex; + } + + /** + * @return the index of the first row in this page if the related data is available (the optional column-index + * contains this value) + */ + public Optional getFirstRowIndex() + { + return firstRowIndex < 0 ? 
Optional.empty() : Optional.of(firstRowIndex); } public int getValueCount() diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPageV1.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPageV1.java index 87c9ebe70a0bf..0f172651e98f8 100755 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPageV1.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPageV1.java @@ -32,12 +32,13 @@ public DataPageV1( Slice slice, int valueCount, int uncompressedSize, + long firstRowIndex, Statistics statistics, ParquetEncoding repetitionLevelEncoding, ParquetEncoding definitionLevelEncoding, ParquetEncoding valuesEncoding) { - super(slice.length(), uncompressedSize, valueCount); + super(slice.length(), uncompressedSize, valueCount, firstRowIndex); this.slice = requireNonNull(slice, "slice is null"); this.statistics = statistics; this.repetitionLevelEncoding = repetitionLevelEncoding; @@ -82,6 +83,7 @@ public String toString() .add("valueCount", valueCount) .add("compressedSize", compressedSize) .add("uncompressedSize", uncompressedSize) + .add("firstRowIndex", getFirstRowIndex()) .toString(); } } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPageV2.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPageV2.java index 7d2f4d0b934ff..569d715cacd93 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPageV2.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/DataPageV2.java @@ -35,6 +35,7 @@ public DataPageV2( int rowCount, int nullCount, int valueCount, + long firstRowIndex, Slice repetitionLevels, Slice definitionLevels, ParquetEncoding dataEncoding, @@ -43,7 +44,7 @@ public DataPageV2( Statistics statistics, boolean isCompressed) { - super(repetitionLevels.length() + definitionLevels.length() + slice.length(), uncompressedSize, valueCount); + super(repetitionLevels.length() + definitionLevels.length() + slice.length(), 
uncompressedSize, valueCount, firstRowIndex); this.rowCount = rowCount; this.nullCount = nullCount; this.repetitionLevels = requireNonNull(repetitionLevels, "repetitionLevels slice is null"); @@ -109,6 +110,7 @@ public String toString() .add("valueCount", valueCount) .add("compressedSize", compressedSize) .add("uncompressedSize", uncompressedSize) + .add("firstRowIndex", getFirstRowIndex()) .toString(); } } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/FileParquetDataSource.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/FileParquetDataSource.java index 3f8f8f8f88ad5..964035c5651f4 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/FileParquetDataSource.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/FileParquetDataSource.java @@ -28,11 +28,16 @@ package com.facebook.presto.parquet; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.RandomAccessFile; import java.io.UncheckedIOException; +import java.util.Optional; public class FileParquetDataSource extends AbstractParquetDataSource @@ -64,4 +69,16 @@ protected void readInternal(long position, byte[] buffer, int bufferOffset, int throw new UncheckedIOException(e); } } + + @Override + public Optional readColumnIndex(ColumnChunkMetaData column) throws IOException + { + throw new UnsupportedOperationException(); + } + + @Override + public Optional readOffsetIndex(ColumnChunkMetaData column) throws IOException + { + throw new UnsupportedOperationException(); + } } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/ParquetDataSource.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/ParquetDataSource.java index 630fea44acee7..44278d23570ad 100644 --- 
a/presto-parquet/src/main/java/com/facebook/presto/parquet/ParquetDataSource.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/ParquetDataSource.java @@ -13,8 +13,13 @@ */ package com.facebook.presto.parquet; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + import java.io.Closeable; import java.io.IOException; +import java.util.Optional; public interface ParquetDataSource extends Closeable @@ -29,6 +34,10 @@ public interface ParquetDataSource void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength); + Optional readColumnIndex(ColumnChunkMetaData column) throws IOException; + + Optional readOffsetIndex(ColumnChunkMetaData column) throws IOException; + @Override default void close() throws IOException diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/AbstractNestedBatchReader.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/AbstractNestedBatchReader.java index 9d2df83b8f293..8e246a56a5d1c 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/AbstractNestedBatchReader.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/AbstractNestedBatchReader.java @@ -29,6 +29,7 @@ import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntList; import org.apache.parquet.Preconditions; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.io.ParquetDecodingException; import java.io.IOException; @@ -77,7 +78,7 @@ public boolean isInitialized() } @Override - public void init(PageReader pageReader, Field field) + public void init(PageReader pageReader, Field field, RowRanges rowRanges) { Preconditions.checkState(!isInitialized(), "already initialized"); this.pageReader = requireNonNull(pageReader, "pageReader is 
null"); diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/BinaryFlatBatchReader.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/BinaryFlatBatchReader.java index 1a409fdb164d9..585103c146083 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/BinaryFlatBatchReader.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/BinaryFlatBatchReader.java @@ -33,6 +33,7 @@ import com.facebook.presto.spi.PrestoException; import io.airlift.slice.Slice; import io.airlift.slice.Slices; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import java.io.IOException; import java.util.ArrayList; @@ -72,7 +73,7 @@ public boolean isInitialized() } @Override - public void init(PageReader pageReader, Field field) + public void init(PageReader pageReader, Field field, RowRanges rowRanges) { checkArgument(!isInitialized(), "Parquet batch reader already initialized"); this.pageReader = requireNonNull(pageReader, "pageReader is null"); diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/Int32FlatBatchReader.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/Int32FlatBatchReader.java index 084380b0d0a26..e4107cebcd741 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/Int32FlatBatchReader.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/batchreader/Int32FlatBatchReader.java @@ -29,6 +29,7 @@ import com.facebook.presto.parquet.reader.ColumnChunk; import com.facebook.presto.parquet.reader.PageReader; import com.facebook.presto.spi.PrestoException; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.io.ParquetDecodingException; import java.io.IOException; @@ -67,7 +68,7 @@ public boolean isInitialized() } @Override - public void init(PageReader pageReader, Field field) + public void init(PageReader pageReader, Field field, 
RowRanges rowRanges) { checkArgument(!isInitialized(), "Parquet batch reader already initialized"); this.pageReader = requireNonNull(pageReader, "pageReader is null"); diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/cache/MetadataReader.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/cache/MetadataReader.java index f32b094538b1b..a4742918f9a7a 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/cache/MetadataReader.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/cache/MetadataReader.java @@ -32,6 +32,7 @@ import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.internal.hadoop.metadata.IndexReference; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; @@ -144,6 +145,8 @@ public static ParquetFileMetadata readFooter(ParquetDataSource parquetDataSource metaData.num_values, metaData.total_compressed_size, metaData.total_uncompressed_size); + column.setColumnIndexReference(toColumnIndexReference(columnChunk)); + column.setOffsetIndexReference(toOffsetIndexReference(columnChunk)); blockMetaData.addColumn(column); } blockMetaData.setPath(filePath); @@ -308,4 +311,20 @@ public ParquetFileMetadata getParquetMetadata(ParquetDataSource parquetDataSourc { return readFooter(parquetDataSource, fileSize); } + + private static IndexReference toColumnIndexReference(ColumnChunk columnChunk) + { + if (columnChunk.isSetColumn_index_offset() && columnChunk.isSetColumn_index_length()) { + return new IndexReference(columnChunk.getColumn_index_offset(), columnChunk.getColumn_index_length()); + } + return null; + } + + private static IndexReference toOffsetIndexReference(ColumnChunk columnChunk) + { + if (columnChunk.isSetOffset_index_offset() && 
columnChunk.isSetOffset_index_length()) { + return new IndexReference(columnChunk.getOffset_index_offset(), columnChunk.getOffset_index_length()); + } + return null; + } } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/Predicate.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/Predicate.java index 0cf048381a71d..0fd9b0b08e8bd 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/Predicate.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/Predicate.java @@ -17,8 +17,10 @@ import com.facebook.presto.parquet.ParquetDataSourceId; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import java.util.Map; +import java.util.Optional; public interface Predicate { @@ -36,6 +38,12 @@ public boolean matches(DictionaryDescriptor dictionary) { return true; } + + @Override + public boolean matches(long numberOfRows, Optional columnIndexStore) + { + return true; + } }; /** @@ -57,4 +65,13 @@ boolean matches(long numberOfRows, Map> statisti * @param dictionary The single column dictionary */ boolean matches(DictionaryDescriptor dictionary); + + /** + * Should the Parquet Reader process a file section with the specified statistics. 
+ * + * @param numberOfRows the number of rows in the segment; this can be used with + * Statistics to determine if a column is only null + * @param columnIndexStore column index (statistics) store + */ + boolean matches(long numberOfRows, Optional columnIndexStore); } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/PredicateUtils.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/PredicateUtils.java index 364ab2ff9c164..05e019b2894f0 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/PredicateUtils.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/PredicateUtils.java @@ -37,6 +37,7 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.schema.MessageType; import java.io.ByteArrayInputStream; @@ -84,7 +85,7 @@ public static Predicate buildPredicate(MessageType requestedSchema, TupleDomain< return new TupleDomainParquetPredicate(parquetTupleDomain, columnReferences.build()); } - public static boolean predicateMatches(Predicate parquetPredicate, BlockMetaData block, ParquetDataSource dataSource, Map, RichColumnDescriptor> descriptorsByPath, TupleDomain parquetTupleDomain) + public static boolean predicateMatches(Predicate parquetPredicate, BlockMetaData block, ParquetDataSource dataSource, Map, RichColumnDescriptor> descriptorsByPath, TupleDomain parquetTupleDomain, Optional columnIndexStore, boolean readColumnIndex) throws ParquetCorruptionException { Map> columnStatistics = getStatistics(block, descriptorsByPath); @@ -92,6 +93,11 @@ public static boolean predicateMatches(Predicate parquetPredicate, BlockMetaData return false; } + // Page stats is finer grained but relatively more expensive, so we do the filtering after above 
block filtering. + if (columnIndexStore.isPresent() && readColumnIndex && !parquetPredicate.matches(block.getRowCount(), columnIndexStore)) { + return false; + } + return dictionaryPredicatesMatch(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain); } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/TupleDomainParquetPredicate.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/TupleDomainParquetPredicate.java index 531e5284e36bf..7cca6fd6df29c 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/TupleDomainParquetPredicate.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/TupleDomainParquetPredicate.java @@ -30,10 +30,20 @@ import io.airlift.slice.Slices; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.filter2.predicate.FilterApi; +import org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.filter2.predicate.UserDefinedPredicate; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import java.io.Serializable; +import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -52,18 +62,25 @@ import static com.google.common.base.Preconditions.checkArgument; import static java.lang.Float.floatToRawIntBits; import static java.lang.String.format; +import static java.nio.ByteOrder.LITTLE_ENDIAN; import static java.util.Objects.requireNonNull; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; public class TupleDomainParquetPredicate implements Predicate { private final TupleDomain effectivePredicate; private final List columns; + private final ColumnIndexValueConverter converter; public TupleDomainParquetPredicate(TupleDomain effectivePredicate, List columns) { this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null"); this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null")); + this.converter = new ColumnIndexValueConverter(columns); } @Override @@ -122,6 +139,42 @@ public boolean matches(DictionaryDescriptor dictionary) return effectivePredicateDomain == null || effectivePredicateMatches(effectivePredicateDomain, dictionary); } + @Override + public boolean matches(long numberOfRows, Optional columnIndexStore) + { + if (numberOfRows == 0 || !columnIndexStore.isPresent()) { + return false; + } + + if (effectivePredicate.isNone()) { + return false; + } + Map effectivePredicateDomains = effectivePredicate.getDomains() + .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains")); + + for (RichColumnDescriptor column : columns) { + Domain effectivePredicateDomain = effectivePredicateDomains.get(column); + if (effectivePredicateDomain == null) { + continue; + } + + if (columnIndexStore.isPresent()) { + ColumnIndex columnIndex = columnIndexStore.get().getColumnIndex(ColumnPath.get(column.getPath())); + if (columnIndex == null || columnIndex.getMinValues().size() == 0 || columnIndex.getMaxValues().size() == 0 || columnIndex.getMinValues().size() != columnIndex.getMaxValues().size()) { + continue; + } + else { + Domain domain = getDomain(effectivePredicateDomain.getType(), numberOfRows, columnIndex, column); + if 
(effectivePredicateDomain.intersect(domain).isNone()) { + return false; + } + } + } + } + + return true; + } + private static boolean effectivePredicateMatches(Domain effectivePredicateDomain, DictionaryDescriptor dictionary) { return !effectivePredicateDomain.intersect(getDomain(effectivePredicateDomain.getType(), dictionary)).isNone(); @@ -295,6 +348,57 @@ public static Domain getDomain(Type type, DictionaryDescriptor dictionaryDescrip return getDomain(columnDescriptor, type, values, values, true); } + @VisibleForTesting + public Domain getDomain(Type type, long rowCount, ColumnIndex columnIndex, RichColumnDescriptor descriptor) + { + if (columnIndex == null) { + return Domain.all(type); + } + + String columnName = descriptor.getPrimitiveType().getName(); + if (isCorruptedColumnIndex(columnIndex)) { + return Domain.all(type); + } + + if (isEmptyColumnIndex(columnIndex)) { + return Domain.all(type); + } + + long totalNullCount = columnIndex.getNullCounts().stream().reduce(0L, (a, b) -> a + b); + if (totalNullCount == rowCount) { + return Domain.onlyNull(type); + } + + boolean hasNullValue = totalNullCount > 0; + + if (descriptor.getType().equals(PrimitiveTypeName.BOOLEAN)) { + // After row-group filtering for boolean, page filtering shouldn't do more + return Domain.all(type); + } + + if (descriptor.getType().equals(INT32) || descriptor.getType().equals(INT64) || descriptor.getType().equals(FLOAT)) { + List mins = converter.getMinValuesAsLong(type, columnIndex, columnName); + List maxs = converter.getMaxValuesAsLong(type, columnIndex, columnName); + return createDomain(type, columnIndex, hasNullValue, mins, maxs); + } + + if (descriptor.getType().equals(PrimitiveTypeName.DOUBLE)) { + List mins = converter.getMinValuesAsDouble(type, columnIndex, columnName); + List maxs = converter.getMaxValuesAsDouble(type, columnIndex, columnName); + return createDomain(type, columnIndex, hasNullValue, mins, maxs); + } + + if (descriptor.getType().equals(BINARY)) { + List mins = 
converter.getMinValuesAsSlice(type, columnIndex); + List maxs = converter.getMaxValuesAsSlice(type, columnIndex); + return createDomain(type, columnIndex, hasNullValue, mins, maxs); + } + + //TODO: Add INT96 and FIXED_LEN_BYTE_ARRAY later + + return Domain.create(ValueSet.all(type), hasNullValue); + } + public static long asLong(Object value) { if (value instanceof Byte || value instanceof Short || value instanceof Integer || value instanceof Long) { @@ -333,4 +437,253 @@ private Function getConverter(PrimitiveType primitiveType) } } } + + private static > Domain createDomain(Type type, ColumnIndex columnIndex, boolean hasNullValue, List mins, List maxs) + { + if (mins.size() == 0 || maxs.size() == 0 || mins.size() != maxs.size()) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + int pageCount = columnIndex.getMinValues().size(); + List ranges = new ArrayList<>(); + for (int i = 0; i < pageCount; i++) { + T min = mins.get(i); + T max = maxs.get(i); + if (min.compareTo(max) > 0) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + + if (min instanceof Long) { + if (isStatisticsOverflow(type, asLong(min), asLong(max))) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + ranges.add(Range.range(type, min, true, max, true)); + } + else if (min instanceof Double) { + if (((Double) min).isNaN() || ((Double) max).isNaN()) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + ranges.add(Range.range(type, min, true, max, true)); + } + else if (min instanceof Slice) { + ranges.add(Range.range(type, min, true, max, true)); + } + } + checkArgument(!ranges.isEmpty(), "cannot use empty ranges"); + return Domain.create(ValueSet.ofRanges(ranges), hasNullValue); + } + + private boolean isCorruptedColumnIndex(ColumnIndex columnIndex) + { + if (columnIndex.getMaxValues() == null || columnIndex.getMinValues() == null || + columnIndex.getNullCounts() == null || columnIndex.getNullPages() == null) { + return true; + } + + if 
(columnIndex.getMaxValues().size() != columnIndex.getMinValues().size() || + columnIndex.getMaxValues().size() != columnIndex.getNullPages().size() || + columnIndex.getMaxValues().size() != columnIndex.getNullCounts().size()) { + return true; + } + + return false; + } + + // Caller should verify isCorruptedColumnIndex is false first + private boolean isEmptyColumnIndex(ColumnIndex columnIndex) + { + return columnIndex.getMaxValues().size() == 0; + } + + public FilterPredicate getParquetUserDefinedPredicate() + { + FilterPredicate filter = null; + + // It could be a Presto bug that we don't see effectivePredicate.getDomains().get() has more than 1 domain. + // For example, the 'where c1=3 or c1=10002' clause should have two domains but it has none + // we assume the relation cross domains are 'or' + for (RichColumnDescriptor column : columns) { + Domain domain = effectivePredicate.getDomains().get().get(column); + if (domain == null || domain.isNone()) { + continue; + } + + if (domain.isAll()) { + continue; + } + + FilterPredicate columnFilter = FilterApi.userDefined(FilterApi.intColumn(ColumnPath.get(column.getPath()).toDotString()), new ParquetUserDefinedPredicateTupleDomain(domain)); + if (filter == null) { + filter = columnFilter; + } + else { + filter = FilterApi.or(filter, columnFilter); + } + } + + return filter; + } + + /** + * This class implements methods defined in UserDefinedPredicate based on the page statistic and tuple domain(for a column). 
+ */ + static class ParquetUserDefinedPredicateTupleDomain> + extends UserDefinedPredicate + implements Serializable + { + private Domain columnDomain; + + ParquetUserDefinedPredicateTupleDomain(Domain domain) + { + this.columnDomain = domain; + } + + @Override + public boolean keep(T value) + { + if (value == null && !columnDomain.isNullAllowed()) { + return false; + } + + return true; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistic) + { + if (statistic == null) { + return false; + } + List ranges = new ArrayList<>(); + ranges.add(Range.range(columnDomain.getType(), statistic.getMin(), true, statistic.getMax(), true)); + return canDropWithRangeStatistics(ranges); + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + // !canDrop() cannot be used because it might not be correct. To be safe, we just keep the record by returning false. + // Since we don't use LogicalNotUserDefined, this method is not called. 
+ return false; + } + + private boolean canDropWithRangeStatistics(List ranges) + { + checkArgument(!ranges.isEmpty(), "cannot use empty ranges"); + Domain domain = Domain.create(ValueSet.ofRanges(ranges), true); + return columnDomain.intersect(domain).isNone(); + } + } + + class ColumnIndexValueConverter + { + private final Map> conversions; + + private ColumnIndexValueConverter(List columns) + { + this.conversions = new HashMap<>(); + for (RichColumnDescriptor column : columns) { + conversions.put(column.getPrimitiveType().getName(), getColumnIndexConversions(column.getPrimitiveType())); + } + } + + public List getMinValuesAsLong(Type type, ColumnIndex columnIndex, String column) + { + return getValuesAsLong(type, column, columnIndex.getMinValues().size(), columnIndex.getMinValues()); + } + + public List getMaxValuesAsLong(Type type, ColumnIndex columnIndex, String column) + { + return getValuesAsLong(type, column, columnIndex.getMaxValues().size(), columnIndex.getMaxValues()); + } + + public List getMinValuesAsDouble(Type type, ColumnIndex columnIndex, String column) + { + return getValuesAsDouble(type, column, columnIndex.getMinValues().size(), columnIndex.getMinValues()); + } + + public List getMaxValuesAsDouble(Type type, ColumnIndex columnIndex, String column) + { + return getValuesAsDouble(type, column, columnIndex.getMaxValues().size(), columnIndex.getMaxValues()); + } + + public List getMinValuesAsSlice(Type type, ColumnIndex columnIndex) + { + return getValuesAsSlice(type, columnIndex.getMinValues().size(), columnIndex.getMinValues()); + } + + public List getMaxValuesAsSlice(Type type, ColumnIndex columnIndex) + { + return getValuesAsSlice(type, columnIndex.getMaxValues().size(), columnIndex.getMaxValues()); + } + + private List getValuesAsLong(Type type, String column, int pageCount, List values) + { + List result = new ArrayList<>(); + if (TINYINT.equals(type) || SMALLINT.equals(type) || INTEGER.equals(type)) { + for (int i = 0; i < pageCount; i++) { + 
result.add((long) converter.convert(values.get(i), column)); + } + } + else if (BIGINT.equals(type)) { + for (int i = 0; i < pageCount; i++) { + result.add((long) converter.convert(values.get(i), column)); + } + } + else if (REAL.equals(type)) { + for (int i = 0; i < pageCount; i++) { + result.add((long) floatToRawIntBits(converter.convert(values.get(i), column))); + } + } + return result; + } + + private List getValuesAsDouble(Type type, String column, int pageCount, List values) + { + List result = new ArrayList<>(); + if (DOUBLE.equals(type)) { + for (int i = 0; i < pageCount; i++) { + result.add(converter.convert(values.get(i), column)); + } + } + return result; + } + + private List getValuesAsSlice(Type type, int pageCount, List values) + { + List result = new ArrayList<>(); + if (isVarcharType(type)) { + for (int i = 0; i < pageCount; i++) { + result.add(Slices.wrappedBuffer(values.get(i))); + } + } + return result; + } + + private T convert(ByteBuffer buffer, String name) + { + return (T) conversions.get(name).apply(buffer); + } + + private Function getColumnIndexConversions(PrimitiveType type) + { + switch (type.getPrimitiveTypeName()) { + case BOOLEAN: + return buffer -> ((ByteBuffer) buffer).get(0) != 0; + case INT32: + return buffer -> ((ByteBuffer) buffer).order(LITTLE_ENDIAN).getInt(0); + case INT64: + return buffer -> ((ByteBuffer) buffer).order(LITTLE_ENDIAN).getLong(0); + case FLOAT: + return buffer -> ((ByteBuffer) buffer).order(LITTLE_ENDIAN).getFloat(0); + case DOUBLE: + return buffer -> ((ByteBuffer) buffer).order(LITTLE_ENDIAN).getDouble(0); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + case INT96: + return binary -> ByteBuffer.wrap(((Binary) binary).getBytes()); + default: + throw new IllegalArgumentException("Unsupported Parquet type: " + type.getPrimitiveTypeName()); + } + } + } } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/AbstractColumnReader.java 
b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/AbstractColumnReader.java index 9cd2515be3881..305029d9d1e98 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/AbstractColumnReader.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/AbstractColumnReader.java @@ -33,9 +33,11 @@ import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.io.ParquetDecodingException; import java.io.IOException; +import java.util.PrimitiveIterator; import java.util.function.Consumer; import static com.facebook.presto.parquet.ValuesType.DEFINITION_LEVEL; @@ -66,6 +68,9 @@ public abstract class AbstractColumnReader private DataPage page; private int remainingValueCountInPage; private int readOffset; + private PrimitiveIterator.OfLong indexIterator; + private long currentRow; + private long targetRow; protected abstract void readValue(BlockBuilder blockBuilder, Type type); @@ -80,6 +85,8 @@ public AbstractColumnReader(RichColumnDescriptor columnDescriptor) { this.columnDescriptor = requireNonNull(columnDescriptor, "columnDescriptor"); pageReader = null; + this.targetRow = Long.MIN_VALUE; + this.indexIterator = null; } @Override @@ -89,7 +96,7 @@ public boolean isInitialized() } @Override - public void init(PageReader pageReader, Field field) + public void init(PageReader pageReader, Field field, RowRanges rowRanges) { this.pageReader = requireNonNull(pageReader, "pageReader is null"); this.field = requireNonNull(field, "field is null"); @@ -108,6 +115,7 @@ public void init(PageReader pageReader, Field field) } checkArgument(pageReader.getTotalValueCount() > 0, "page is empty"); totalValueCount = pageReader.getTotalValueCount(); + indexIterator = (rowRanges == null) ? 
null : rowRanges.iterator(); } @Override @@ -130,10 +138,13 @@ public ColumnChunk readNext() readNextPage(); } int valuesToRead = Math.min(remainingValueCountInPage, nextBatchSize - valueCount); + if (valuesToRead == 0) { + // When we break here, we could end up with valueCount < nextBatchSize, this is because we may skip reading values in readValues() + break; + } readValues(blockBuilder, valuesToRead, field.getType(), definitionLevels, repetitionLevels); valueCount += valuesToRead; } - checkArgument(valueCount == nextBatchSize, "valueCount %s not equals to batchSize %s", valueCount, nextBatchSize); readOffset = 0; nextBatchSize = 0; @@ -146,38 +157,81 @@ private void readValues(BlockBuilder blockBuilder, int valuesToRead, Type type, readValue(blockBuilder, type); definitionLevels.add(definitionLevel); repetitionLevels.add(repetitionLevel); - }); + }, indexIterator != null); } private void skipValues(int valuesToRead) { - processValues(valuesToRead, ignored -> skipValue()); + processValues(valuesToRead, ignored -> skipValue(), false); } - private void processValues(int valuesToRead, Consumer valueConsumer) + /** + * When filtering using column indexes we might skip reading some pages for different columns. Because the rows are + * not aligned between the pages of the different columns it might be required to skip some values. The values (and the + * related rl and dl) are skipped based on the iterator of the required row indexes and the first row index of each + * page. + * For example: + * + * rows col1 col2 col3 + * ┌──────┬──────┬──────┐ + * 0 │ p0 │ │ │ + * ╞══════╡ p0 │ p0 │ + * 20 │ p1(X)│------│------│ + * ╞══════╪══════╡ │ + * 40 │ p2(X)│ │------│ + * ╞══════╡ p1(X)╞══════╡ + * 60 │ p3(X)│ │------│ + * ╞══════╪══════╡ │ + * 80 │ p4 │ │ p1 │ + * ╞══════╡ p2 │ │ + * 100 │ p5 │ │ │ + * └──────┴──────┴──────┘ + * + * The pages 1, 2, 3 in col1 are skipped so we have to skip the rows [20, 79]. 
Because page 1 in col2 contains values + * only for the rows [40, 79] we skip this entire page as well. To synchronize the row reading we have to skip the + * values (and the related rl and dl) for the rows [20, 39] in the end of the page 0 for col2. Similarly, we have to + * skip values while reading page0 and page1 for col3. + */ + private void processValues(int valuesToRead, Consumer valueConsumer, boolean indexEnabled) { if (definitionLevel == EMPTY_LEVEL_VALUE && repetitionLevel == EMPTY_LEVEL_VALUE) { definitionLevel = definitionReader.readLevel(); repetitionLevel = repetitionReader.readLevel(); } int valueCount = 0; - for (int i = 0; i < valuesToRead; i++) { + int skipCount = 0; + for (int i = 0; i < valuesToRead; ) { + boolean consumed = false; do { - valueConsumer.accept(null); - valueCount++; - if (valueCount == remainingValueCountInPage) { - updateValueCounts(valueCount); + if (skipRL(repetitionLevel, indexEnabled)) { + skipValue(); + skipCount++; + } + else { + valueConsumer.accept(null); + valueCount++; + consumed = true; + } + + if (valueCount + skipCount == remainingValueCountInPage) { + updateValueCounts(valueCount, skipCount); if (!readNextPage()) { return; } valueCount = 0; + skipCount = 0; } + repetitionLevel = repetitionReader.readLevel(); definitionLevel = definitionReader.readLevel(); } while (repetitionLevel != 0); + + if (consumed) { + i++; + } } - updateValueCounts(valueCount); + updateValueCounts(valueCount, skipCount); } private void seek() @@ -216,13 +270,14 @@ private boolean readNextPage() return true; } - private void updateValueCounts(int valuesRead) + private void updateValueCounts(int valuesRead, int skipCount) { - if (valuesRead == remainingValueCountInPage) { + int totalCount = valuesRead + skipCount; + if (totalCount == remainingValueCountInPage) { page = null; valuesReader = null; } - remainingValueCountInPage -= valuesRead; + remainingValueCountInPage -= totalCount; currentValueCount += valuesRead; } @@ -236,7 +291,8 @@ 
private ValuesReader readPageV1(DataPageV1 page) ByteBufferInputStream bufferInputStream = ByteBufferInputStream.wrap(page.getSlice().toByteBuffer()); repetitionLevelReader.initFromPage(page.getValueCount(), bufferInputStream); definitionLevelReader.initFromPage(page.getValueCount(), bufferInputStream); - return initDataReader(page.getValueEncoding(), bufferInputStream, page.getValueCount()); + long firstRowIndex = page.getFirstRowIndex().orElse(-1L); + return initDataReader(page.getValueEncoding(), bufferInputStream, page.getValueCount(), firstRowIndex); } catch (IOException e) { throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e); @@ -247,7 +303,8 @@ private ValuesReader readPageV2(DataPageV2 page) { repetitionReader = buildLevelRLEReader(columnDescriptor.getMaxRepetitionLevel(), page.getRepetitionLevels()); definitionReader = buildLevelRLEReader(columnDescriptor.getMaxDefinitionLevel(), page.getDefinitionLevels()); - return initDataReader(page.getDataEncoding(), ByteBufferInputStream.wrap(ImmutableList.of(page.getSlice().toByteBuffer())), page.getValueCount()); + long firstRowIndex = page.getFirstRowIndex().orElse(-1L); + return initDataReader(page.getDataEncoding(), ByteBufferInputStream.wrap(ImmutableList.of(page.getSlice().toByteBuffer())), page.getValueCount(), firstRowIndex); } private LevelReader buildLevelRLEReader(int maxLevel, Slice slice) @@ -259,7 +316,7 @@ private LevelReader buildLevelRLEReader(int maxLevel, Slice slice) return new LevelRLEReader(new RunLengthBitPackingHybridDecoder(BytesUtils.getWidthFromMaxInt(maxLevel), slice.getInput())); } - private ValuesReader initDataReader(ParquetEncoding dataEncoding, ByteBufferInputStream inputStream, int valueCount) + private ValuesReader initDataReader(ParquetEncoding dataEncoding, ByteBufferInputStream inputStream, int valueCount, long firstRowIndex) { ValuesReader valuesReader; if (dataEncoding.usesDictionary()) { @@ -274,10 +331,32 @@ private 
ValuesReader initDataReader(ParquetEncoding dataEncoding, ByteBufferInpu try { valuesReader.initFromPage(valueCount, inputStream); + if (firstRowIndex != -1) { + currentRow = firstRowIndex - 1; + } + else { + currentRow = -1; + } return valuesReader; } catch (IOException e) { throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, e); } } + + private boolean skipRL(int repetitionLevel, boolean indexEnabled) + { + if (!indexEnabled || indexIterator == null) { + return false; + } + + if (repetitionLevel == 0) { + currentRow = currentRow + 1; + if (currentRow > targetRow) { + targetRow = indexIterator.hasNext() ? indexIterator.nextLong() : Long.MAX_VALUE; + } + } + + return currentRow < targetRow; + } } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ColumnIndexFilterUtils.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ColumnIndexFilterUtils.java new file mode 100644 index 0000000000000..431baa2a584a7 --- /dev/null +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ColumnIndexFilterUtils.java @@ -0,0 +1,208 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.parquet.reader; + +import com.facebook.presto.parquet.ParquetDataSource; +import com.facebook.presto.parquet.RichColumnDescriptor; +import com.facebook.presto.parquet.predicate.Predicate; +import com.facebook.presto.parquet.predicate.TupleDomainParquetPredicate; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Formatter; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +public class ColumnIndexFilterUtils +{ + private ColumnIndexFilterUtils() {} + + static class OffsetRange + { + private final long offset; + private long length; + + public OffsetRange(long offset, int length) + { + this.offset = offset; + this.length = length; + } + + long getOffset() + { + return offset; + } + + long getLength() + { + return length; + } + + private boolean extendWithCheck(long offset, int length) + { + if (this.offset + this.length == offset) { + this.length += length; + return true; + } + else { + return false; + } + } + + public void extendLength(long length) + { + this.length += length; + } + + public long endPos() + { + return offset + length; + } + } + + private static class FilteredOffsetIndex + implements OffsetIndex + { + private final OffsetIndex offsetIndex; + private final int[] indices; + + private FilteredOffsetIndex(OffsetIndex offsetIndex, int[] indices) + { + this.offsetIndex = offsetIndex; + this.indices = indices; + } + + @Override + public int 
getPageCount() + { + return indices.length; + } + + @Override + public long getOffset(int pageIndex) + { + return offsetIndex.getOffset(indices[pageIndex]); + } + + @Override + public int getCompressedPageSize(int pageIndex) + { + return offsetIndex.getCompressedPageSize(indices[pageIndex]); + } + + @Override + public long getFirstRowIndex(int pageIndex) + { + return offsetIndex.getFirstRowIndex(indices[pageIndex]); + } + + @Override + public long getLastRowIndex(int pageIndex, long totalRowCount) + { + int nextIndex = indices[pageIndex] + 1; + return (nextIndex >= offsetIndex.getPageCount() ? totalRowCount : offsetIndex.getFirstRowIndex(nextIndex)) - 1; + } + + @Override + public String toString() + { + try (Formatter formatter = new Formatter()) { + formatter.format("%-12s %20s %16s %20s\n", "", "offset", "compressed size", "first row index"); + for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { + int index = Arrays.binarySearch(indices, i); + boolean isHidden = index < 0; + formatter.format("%spage-%-5d %20d %16d %20d\n", + isHidden ? "- " : " ", + isHidden ? i : index, + offsetIndex.getOffset(i), + offsetIndex.getCompressedPageSize(i), + offsetIndex.getFirstRowIndex(i)); + } + return formatter.toString(); + } + } + } + + /* + * Returns the filtered offset index containing only the pages which are overlapping with rowRanges. 
+ */ + static OffsetIndex filterOffsetIndex(OffsetIndex offsetIndex, RowRanges rowRanges, long totalRowCount) + { + IntList indices = new IntArrayList(); + for (int i = 0; i < offsetIndex.getPageCount(); i++) { + long from = offsetIndex.getFirstRowIndex(i); + if (rowRanges.isOverlapping(from, offsetIndex.getLastRowIndex(i, totalRowCount))) { + indices.add(i); + } + } + return new FilteredOffsetIndex(offsetIndex, indices.toIntArray()); + } + + static List calculateOffsetRanges(OffsetIndex offsetIndex, ColumnChunkMetaData columnChunkMetadata, long firstPageOffset, long startingPosition) + { + List ranges = new ArrayList<>(); + int pageCount = offsetIndex.getPageCount(); + if (pageCount > 0) { + OffsetRange currentRange = null; + + // Add a range for the dictionary page if required + long rowGroupOffset = columnChunkMetadata.getStartingPos(); + if (rowGroupOffset < firstPageOffset) { + // We need to adjust the offset by startingPosition for presto because dataSource.readFully() started at startingPosition + currentRange = new OffsetRange(rowGroupOffset - startingPosition, (int) (firstPageOffset - rowGroupOffset)); + ranges.add(currentRange); + } + + for (int i = 0; i < pageCount; i++) { + long offset = offsetIndex.getOffset(i); + int length = offsetIndex.getCompressedPageSize(i); + // We need to adjust the offset by startingPosition for presto because dataSource.readFully() started at startingPosition + if (currentRange == null || !currentRange.extendWithCheck(offset - startingPosition, length)) { + currentRange = new OffsetRange(offset - startingPosition, length); + ranges.add(currentRange); + } + } + } + return ranges; + } + + public static Optional getColumnIndexStore(Predicate parquetPredicate, ParquetDataSource dataSource, BlockMetaData blockMetadata, Map, RichColumnDescriptor> descriptorsByPath, boolean columnIndexFilterEnabled) + { + if (!columnIndexFilterEnabled || parquetPredicate == null || !(parquetPredicate instanceof TupleDomainParquetPredicate)) { + 
return Optional.empty(); + } + + for (ColumnChunkMetaData column : blockMetadata.getColumns()) { + if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) { + Set paths = new HashSet<>(); + for (List path : descriptorsByPath.keySet()) { + paths.add(ColumnPath.get(path.toArray(new String[0]))); + } + return Optional.of(ParquetColumnIndexStore.create(dataSource, blockMetadata, paths)); + } + } + return Optional.empty(); + } +} diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/PageReader.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/PageReader.java index 82e76aa4af135..c0a0b2e51ede2 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/PageReader.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/PageReader.java @@ -17,7 +17,9 @@ import com.facebook.presto.parquet.DataPageV1; import com.facebook.presto.parquet.DataPageV2; import com.facebook.presto.parquet.DictionaryPage; +import io.airlift.slice.Slice; import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; import java.io.IOException; import java.util.LinkedList; @@ -31,6 +33,8 @@ public class PageReader private final long valueCount; private final LinkedList compressedPages; private final DictionaryPage compressedDictionaryPage; + private final OffsetIndex offsetIndex; + private int pageIndex; /** * @param compressedPages This parameter will be mutated destructively as {@link DataPage} entries are removed as part of {@link #readPage()}. 
The caller @@ -38,14 +42,27 @@ public class PageReader */ public PageReader(CompressionCodecName codec, LinkedList compressedPages, - DictionaryPage compressedDictionaryPage, - long valueCount) + DictionaryPage compressedDictionaryPage) throws IOException + { + this(codec, compressedPages, compressedDictionaryPage, null); + } + + public PageReader(CompressionCodecName codec, + LinkedList compressedPages, + DictionaryPage compressedDictionaryPage, + OffsetIndex offsetIndex) { this.codec = codec; this.compressedPages = compressedPages; this.compressedDictionaryPage = compressedDictionaryPage; - this.valueCount = valueCount; + int count = 0; + for (DataPage page : compressedPages) { + count += page.getValueCount(); + } + this.valueCount = count; + this.offsetIndex = offsetIndex; + this.pageIndex = 0; } public long getTotalValueCount() @@ -60,12 +77,16 @@ public DataPage readPage() } DataPage compressedPage = compressedPages.removeFirst(); try { + long firstRowIndex = getFirstRowIndex(pageIndex, offsetIndex); + pageIndex = pageIndex + 1; if (compressedPage instanceof DataPageV1) { DataPageV1 dataPageV1 = (DataPageV1) compressedPage; + Slice slice = decompress(codec, dataPageV1.getSlice(), dataPageV1.getUncompressedSize()); return new DataPageV1( - decompress(codec, dataPageV1.getSlice(), dataPageV1.getUncompressedSize()), + slice, dataPageV1.getValueCount(), dataPageV1.getUncompressedSize(), + firstRowIndex, dataPageV1.getStatistics(), dataPageV1.getRepetitionLevelEncoding(), dataPageV1.getDefinitionLevelEncoding(), @@ -79,14 +100,16 @@ public DataPage readPage() int uncompressedSize = toIntExact(dataPageV2.getUncompressedSize() - dataPageV2.getDefinitionLevels().length() - dataPageV2.getRepetitionLevels().length()); + Slice slice = decompress(codec, dataPageV2.getSlice(), uncompressedSize); return new DataPageV2( dataPageV2.getRowCount(), dataPageV2.getNullCount(), dataPageV2.getValueCount(), + firstRowIndex, dataPageV2.getRepetitionLevels(), 
dataPageV2.getDefinitionLevels(), dataPageV2.getDataEncoding(), - decompress(codec, dataPageV2.getSlice(), uncompressedSize), + slice, dataPageV2.getUncompressedSize(), dataPageV2.getStatistics(), false); @@ -112,4 +135,9 @@ public DictionaryPage readDictionaryPage() throw new RuntimeException("Error reading dictionary page", e); } } + + public static long getFirstRowIndex(int pageIndex, OffsetIndex offsetIndex) + { + return offsetIndex == null ? -1 : offsetIndex.getFirstRowIndex(pageIndex); + } } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetColumnChunk.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetColumnChunk.java index 6fe85c97f975d..1ebe589f8148c 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetColumnChunk.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetColumnChunk.java @@ -20,15 +20,17 @@ import com.facebook.presto.parquet.ParquetCorruptionException; import com.facebook.presto.parquet.cache.MetadataReader; import io.airlift.slice.Slice; +import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.Encoding; import org.apache.parquet.format.DataPageHeader; import org.apache.parquet.format.DataPageHeaderV2; import org.apache.parquet.format.DictionaryPageHeader; import org.apache.parquet.format.PageHeader; import org.apache.parquet.format.Util; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import java.io.ByteArrayInputStream; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.LinkedList; import java.util.List; @@ -36,18 +38,29 @@ import static io.airlift.slice.Slices.wrappedBuffer; public class ParquetColumnChunk - extends ByteArrayInputStream { private final ColumnChunkDescriptor descriptor; + private final ByteBufferInputStream stream; + private final OffsetIndex offsetIndex; public ParquetColumnChunk( ColumnChunkDescriptor descriptor, byte[] data, int 
offset) { - super(data); + this.stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(data, offset, data.length - offset)); this.descriptor = descriptor; - this.pos = offset; + this.offsetIndex = null; + } + + public ParquetColumnChunk( + ColumnChunkDescriptor descriptor, + List data, + OffsetIndex offsetIndex) + { + this.stream = ByteBufferInputStream.wrap(data); + this.descriptor = descriptor; + this.offsetIndex = offsetIndex; } public ColumnChunkDescriptor getDescriptor() @@ -58,7 +71,7 @@ public ColumnChunkDescriptor getDescriptor() protected PageHeader readPageHeader() throws IOException { - return Util.readPageHeader(this); + return Util.readPageHeader(stream); } public PageReader readAllPages() @@ -67,10 +80,12 @@ public PageReader readAllPages() LinkedList pages = new LinkedList<>(); DictionaryPage dictionaryPage = null; long valueCount = 0; - while (valueCount < descriptor.getColumnChunkMetaData().getValueCount()) { + int dataPageCount = 0; + while (hasMorePages(valueCount, dataPageCount)) { PageHeader pageHeader = readPageHeader(); int uncompressedPageSize = pageHeader.getUncompressed_page_size(); int compressedPageSize = pageHeader.getCompressed_page_size(); + long firstRowIndex = -1; switch (pageHeader.type) { case DICTIONARY_PAGE: if (dictionaryPage != null) { @@ -79,32 +94,36 @@ public PageReader readAllPages() dictionaryPage = readDictionaryPage(pageHeader, uncompressedPageSize, compressedPageSize); break; case DATA_PAGE: - valueCount += readDataPageV1(pageHeader, uncompressedPageSize, compressedPageSize, pages); + firstRowIndex = PageReader.getFirstRowIndex(dataPageCount, offsetIndex); + valueCount += readDataPageV1(pageHeader, uncompressedPageSize, compressedPageSize, firstRowIndex, pages); + dataPageCount = dataPageCount + 1; break; case DATA_PAGE_V2: - valueCount += readDataPageV2(pageHeader, uncompressedPageSize, compressedPageSize, pages); + firstRowIndex = PageReader.getFirstRowIndex(dataPageCount, offsetIndex); + valueCount += 
readDataPageV2(pageHeader, uncompressedPageSize, compressedPageSize, firstRowIndex, pages); + dataPageCount = dataPageCount + 1; break; default: - skip(compressedPageSize); + stream.skipFully(compressedPageSize); break; } } - return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), pages, dictionaryPage, valueCount); + return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), pages, dictionaryPage, offsetIndex); } - public int getPosition() + private Slice getSlice(int size) throws IOException { - return pos; - } - - private Slice getSlice(int size) - { - Slice slice = wrappedBuffer(buf, pos, size); - pos += size; - return slice; + //Todo: 1) The stream.slice() in both MultiBufferInputStream and SingleBufferInputStream will clone the memory. + // Need to check how much the memory consumption goes up. Since we skip reading pages, that would reduce + // a lot of memory consumption and compensate. + // 2) It adds exception IOException. It is OK because eventually it rewinds to readAllPages() which + // already has IOException + ByteBuffer buffer = stream.slice(size); + return wrappedBuffer(buffer.array(), buffer.position(), size); } private DictionaryPage readDictionaryPage(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize) + throws IOException { DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header(); return new DictionaryPage( @@ -117,13 +136,16 @@ private DictionaryPage readDictionaryPage(PageHeader pageHeader, int uncompresse private long readDataPageV1(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize, + long firstRowIndex, List pages) + throws IOException { DataPageHeader dataHeaderV1 = pageHeader.getData_page_header(); pages.add(new DataPageV1( getSlice(compressedPageSize), dataHeaderV1.getNum_values(), uncompressedPageSize, + firstRowIndex, MetadataReader.readStats( dataHeaderV1.getStatistics(), descriptor.getColumnDescriptor().getType()), @@ -136,7 +158,9 @@ private long 
readDataPageV1(PageHeader pageHeader, private long readDataPageV2(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize, + long firstRowIndex, List pages) + throws IOException { DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2(); int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length(); @@ -144,6 +168,7 @@ private long readDataPageV2(PageHeader pageHeader, dataHeaderV2.getNum_rows(), dataHeaderV2.getNum_nulls(), dataHeaderV2.getNum_values(), + firstRowIndex, getSlice(dataHeaderV2.getRepetition_levels_byte_length()), getSlice(dataHeaderV2.getDefinition_levels_byte_length()), getParquetEncoding(Encoding.valueOf(dataHeaderV2.getEncoding().name())), @@ -155,4 +180,10 @@ private long readDataPageV2(PageHeader pageHeader, dataHeaderV2.isIs_compressed())); return dataHeaderV2.getNum_values(); } + + private boolean hasMorePages(long valuesCount, int pagesCount) + { + return offsetIndex == null ? valuesCount < descriptor.getColumnChunkMetaData().getValueCount() + : pagesCount < offsetIndex.getPageCount(); + } } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetColumnIndexStore.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetColumnIndexStore.java new file mode 100644 index 0000000000000..08ced0b03e266 --- /dev/null +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetColumnIndexStore.java @@ -0,0 +1,160 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.parquet.reader; + +import com.facebook.presto.parquet.ParquetDataSource; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import static java.util.Collections.emptySet; + +/** + * Internal implementation of {@link ColumnIndexStore}. + */ +public class ParquetColumnIndexStore + implements ColumnIndexStore +{ + private interface IndexStore + { + Optional getColumnIndex(); + + Optional getOffsetIndex(); + } + + private class PageIndexStore + implements IndexStore + { + private final ColumnChunkMetaData columnChunkMetadata; + private Optional columnIndex; + private boolean columnIndexRead; + private final Optional offsetIndex; + + PageIndexStore(ColumnChunkMetaData meta) + { + this.columnChunkMetadata = meta; + try { + this.offsetIndex = dataSource.readOffsetIndex(meta); + } + catch (IOException e) { + // If the I/O issue still stands it will fail the reading later; + // otherwise we fail the filtering only with a missing offset index. + throw new MissingOffsetIndexException(meta.getPath()); + } + } + + @Override + public Optional getColumnIndex() + { + if (!columnIndexRead) { + try { + columnIndex = dataSource.readColumnIndex(columnChunkMetadata); + } + catch (IOException e) { + // If the I/O issue still stands it will fail the reading later; + // otherwise we fail the filtering only with a missing column index. 
+ } + columnIndexRead = true; + } + return columnIndex; + } + + @Override + public Optional getOffsetIndex() + { + return offsetIndex; + } + } + + private static final ParquetColumnIndexStore.IndexStore MISSING_INDEX_STORE = new IndexStore() + { + @Override + public Optional getColumnIndex() + { + return null; + } + + @Override + public Optional getOffsetIndex() + { + return null; + } + }; + + private static final ParquetColumnIndexStore EMPTY = new ParquetColumnIndexStore(null, new BlockMetaData(), emptySet()) + { + @Override + public ColumnIndex getColumnIndex(ColumnPath column) + { + return null; + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) + { + throw new MissingOffsetIndexException(column); + } + }; + + private final ParquetDataSource dataSource; + private final Map store; + + /* + * Creates a column index store which lazily reads column/offset indexes for the columns in paths. (paths are the set + * of columns used for the projection) + */ + public static ColumnIndexStore create(ParquetDataSource dataSource, BlockMetaData block, Set paths) + { + try { + return new ParquetColumnIndexStore(dataSource, block, paths); + } + catch (MissingOffsetIndexException e) { + return EMPTY; + } + } + + private ParquetColumnIndexStore(ParquetDataSource dataSource, BlockMetaData block, Set paths) + { + this.dataSource = dataSource; + Map store = new HashMap<>(); + for (ColumnChunkMetaData column : block.getColumns()) { + ColumnPath path = column.getPath(); + if (paths.contains(path)) { + store.put(path, new ParquetColumnIndexStore.PageIndexStore(column)); + } + } + this.store = store; + } + + @Override + public ColumnIndex getColumnIndex(ColumnPath column) + { + return store.getOrDefault(column, MISSING_INDEX_STORE).getColumnIndex().orElse(null); + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) + { + return store.getOrDefault(column, MISSING_INDEX_STORE).getOffsetIndex().orElse(null); + } +} diff --git 
a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetReader.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetReader.java index 02722e33f4d7b..53296b9575961 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetReader.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/reader/ParquetReader.java @@ -34,24 +34,39 @@ import com.facebook.presto.parquet.ParquetResultVerifierUtils; import com.facebook.presto.parquet.PrimitiveField; import com.facebook.presto.parquet.RichColumnDescriptor; +import com.facebook.presto.parquet.predicate.Predicate; +import com.facebook.presto.parquet.predicate.TupleDomainParquetPredicate; +import com.facebook.presto.parquet.reader.ColumnIndexFilterUtils.OffsetRange; import io.airlift.units.DataSize; import it.unimi.dsi.fastutil.booleans.BooleanArrayList; import it.unimi.dsi.fastutil.booleans.BooleanList; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntList; import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.predicate.FilterPredicate; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.io.MessageColumnIO; import org.apache.parquet.io.PrimitiveColumnIO; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import java.io.Closeable; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; 
+import java.util.Map; import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.IntegerType.INTEGER; @@ -81,11 +96,13 @@ public class ParquetReader private final AggregatedMemoryContext systemMemoryContext; private final boolean batchReadEnabled; private final boolean enableVerification; + private final FilterPredicate filter; private int currentBlock; private BlockMetaData currentBlockMetadata; private long currentPosition; private long currentGroupRowCount; + private RowRanges currentGroupRowRanges; private long nextRowInGroup; private int batchSize; @@ -99,6 +116,12 @@ public class ParquetReader private AggregatedMemoryContext currentRowGroupMemoryContext; + private final List blockIndexStores; + private final List blockRowRanges; + private final Map paths = new HashMap<>(); + + private final boolean columnIndexFilterEnabled; + public ParquetReader(MessageColumnIO messageColumnIO, List blocks, @@ -106,7 +129,10 @@ public ParquetReader(MessageColumnIO AggregatedMemoryContext systemMemoryContext, DataSize maxReadBlockSize, boolean batchReadEnabled, - boolean enableVerification) + boolean enableVerification, + Predicate parquetPredicate, + List blockIndexStores, + boolean columnIndexFilterEnabled) { this.blocks = blocks; this.dataSource = requireNonNull(dataSource, "dataSource is null"); @@ -119,6 +145,20 @@ public ParquetReader(MessageColumnIO this.enableVerification = enableVerification; verificationColumnReaders = enableVerification ? 
new ColumnReader[columns.size()] : null; maxBytesPerCell = new long[columns.size()]; + this.blockIndexStores = blockIndexStores; + this.blockRowRanges = listWithNulls(this.blocks.size()); + for (PrimitiveColumnIO column : columns) { + ColumnDescriptor columnDescriptor = column.getColumnDescriptor(); + this.paths.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor); + } + if (parquetPredicate != null && columnIndexFilterEnabled && parquetPredicate instanceof TupleDomainParquetPredicate) { + this.filter = ((TupleDomainParquetPredicate) parquetPredicate).getParquetUserDefinedPredicate(); + } + else { + this.filter = null; + } + this.currentBlock = -1; + this.columnIndexFilterEnabled = columnIndexFilterEnabled; } @Override @@ -159,6 +199,7 @@ public int nextBatch() private boolean advanceToNextRowGroup() { + currentBlock++; currentRowGroupMemoryContext.close(); currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext(); @@ -166,7 +207,17 @@ private boolean advanceToNextRowGroup() return false; } currentBlockMetadata = blocks.get(currentBlock); - currentBlock = currentBlock + 1; + + if (filter != null && columnIndexFilterEnabled) { + ColumnIndexStore columnIndexStore = blockIndexStores.get(currentBlock); + if (columnIndexStore != null) { + currentGroupRowRanges = getRowRanges(currentBlock); + long rowCount = currentGroupRowRanges.rowCount(); + if (rowCount == 0) { + return false; + } + } + } nextRowInGroup = 0L; currentGroupRowCount = currentBlockMetadata.getRowCount(); @@ -243,16 +294,37 @@ private ColumnChunk readPrimitive(PrimitiveField field) ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor); long startingPosition = metadata.getStartingPos(); int totalSize = toIntExact(metadata.getTotalSize()); - byte[] buffer = allocateBlock(totalSize); - dataSource.readFully(startingPosition, buffer); - ColumnChunkDescriptor descriptor = new ColumnChunkDescriptor(columnDescriptor, metadata, totalSize); - ParquetColumnChunk 
columnChunk = new ParquetColumnChunk(descriptor, buffer, 0); - columnReader.init(columnChunk.readAllPages(), field); - if (enableVerification) { - ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()]; - ParquetColumnChunk columnChunkVerfication = new ParquetColumnChunk(descriptor, buffer, 0); - verificationColumnReader.init(columnChunkVerfication.readAllPages(), field); + if (shouldUseColumnIndex(metadata.getPath())) { + OffsetIndex offsetIndex = blockIndexStores.get(currentBlock).getOffsetIndex(metadata.getPath()); + OffsetIndex filteredOffsetIndex = ColumnIndexFilterUtils.filterOffsetIndex(offsetIndex, currentGroupRowRanges, blocks.get(currentBlock).getRowCount()); + List offsetRanges = ColumnIndexFilterUtils.calculateOffsetRanges(filteredOffsetIndex, metadata, offsetIndex.getOffset(0), startingPosition); + List consecutiveRanges = concatRanges(offsetRanges); + List buffers = allocateBlocks(consecutiveRanges); + for (int i = 0; i < consecutiveRanges.size(); i++) { + ByteBuffer buffer = buffers.get(i); + dataSource.readFully(startingPosition + consecutiveRanges.get(i).getOffset(), buffer.array()); + } + PageReader pageReader = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex); + columnReader.init(pageReader, field, currentGroupRowRanges); + + if (enableVerification) { + ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()]; + PageReader pageReaderVerification = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex); + verificationColumnReader.init(pageReaderVerification, field, currentGroupRowRanges); + } + } + else { + byte[] buffer = allocateBlock(totalSize); + dataSource.readFully(startingPosition, buffer); + PageReader pageReader = createPageReader(buffer, totalSize, metadata, columnDescriptor); + columnReader.init(pageReader, field, null); + + if (enableVerification) { + ColumnReader verificationColumnReader = 
verificationColumnReaders[field.getId()]; + PageReader pageReaderVerification = createPageReader(buffer, totalSize, metadata, columnDescriptor); + verificationColumnReader.init(pageReaderVerification, field, null); + } } } @@ -276,9 +348,44 @@ private ColumnChunk readPrimitive(PrimitiveField field) return columnChunk; } - private byte[] allocateBlock(int length) + private boolean shouldUseColumnIndex(ColumnPath path) + { + return filter != null && + columnIndexFilterEnabled && + currentGroupRowRanges != null && + currentGroupRowRanges.rowCount() < currentGroupRowCount && + blockIndexStores.get(currentBlock) != null && + blockIndexStores.get(currentBlock).getColumnIndex(path) != null; + } + + private List allocateBlocks(List pageRanges) + { + List buffers = new ArrayList<>(); + for (OffsetRange pageRange : pageRanges) { + buffers.add(ByteBuffer.wrap(allocateBlock(pageRange.getLength()))); + } + return buffers; + } + + protected PageReader createPageReader(List buffers, int bufferSize, ColumnChunkMetaData metadata, ColumnDescriptor columnDescriptor, OffsetIndex offsetIndex) + throws IOException + { + ColumnChunkDescriptor descriptor = new ColumnChunkDescriptor(columnDescriptor, metadata, bufferSize); + ParquetColumnChunk columnChunk = new ParquetColumnChunk(descriptor, buffers, offsetIndex); + return columnChunk.readAllPages(); + } + + protected PageReader createPageReader(byte[] buffer, int bufferSize, ColumnChunkMetaData metadata, ColumnDescriptor columnDescriptor) + throws IOException + { + ColumnChunkDescriptor descriptor = new ColumnChunkDescriptor(columnDescriptor, metadata, bufferSize); + ParquetColumnChunk columnChunk = new ParquetColumnChunk(descriptor, buffer, 0); + return columnChunk.readAllPages(); + } + + protected byte[] allocateBlock(long length) { - byte[] buffer = new byte[length]; + byte[] buffer = new byte[toIntExact(length)]; LocalMemoryContext blockMemoryContext = 
currentRowGroupMemoryContext.newLocalMemoryContext(ParquetReader.class.getSimpleName()); blockMemoryContext.setBytes(buffer.length); return buffer; @@ -408,4 +515,38 @@ private static Block rewriteLongArrayBlock(LongArrayBlock longArrayBlock, Type t return newBlockBuilder.build(); } + + private static List listWithNulls(int size) + { + return Stream.generate(() -> (T) null).limit(size).collect(Collectors.toCollection(ArrayList::new)); + } + + private RowRanges getRowRanges(int blockIndex) + { + assert filter != null; + + RowRanges rowRanges = blockRowRanges.get(blockIndex); + if (rowRanges == null) { + rowRanges = ColumnIndexFilter.calculateRowRanges(FilterCompat.get(filter), blockIndexStores.get(blockIndex), + paths.keySet(), blocks.get(blockIndex).getRowCount()); + blockRowRanges.set(blockIndex, rowRanges); + } + return rowRanges; + } + + private List concatRanges(List offsetRanges) + { + List pageRanges = new ArrayList<>(); + OffsetRange currentParts = null; + for (OffsetRange range : offsetRanges) { + long startPosition = range.getOffset(); + // first part or not consecutive => new list + if (currentParts == null || currentParts.endPos() != startPosition) { + currentParts = new OffsetRange(startPosition, 0); + } + pageRanges.add(currentParts); + currentParts.extendLength(range.getLength()); + } + return pageRanges; + } } diff --git a/presto-parquet/src/test/java/com/facebook/presto/parquet/BenchmarkParquetReader.java b/presto-parquet/src/test/java/com/facebook/presto/parquet/BenchmarkParquetReader.java index d6291b601e4ff..8e0032bdff183 100644 --- a/presto-parquet/src/test/java/com/facebook/presto/parquet/BenchmarkParquetReader.java +++ b/presto-parquet/src/test/java/com/facebook/presto/parquet/BenchmarkParquetReader.java @@ -278,7 +278,7 @@ ParquetReader createRecordReader() this.field = ColumnIOConverter.constructField(getType(), messageColumnIO.getChild(0)).get(); - return new ParquetReader(messageColumnIO, parquetMetadata.getBlocks(), dataSource, 
newSimpleAggregatedMemoryContext(), new DataSize(16, MEGABYTE), enableOptimizedReader, enableVerification); + return new ParquetReader(messageColumnIO, parquetMetadata.getBlocks(), dataSource, newSimpleAggregatedMemoryContext(), new DataSize(16, MEGABYTE), enableOptimizedReader, enableVerification, null, null, false); } protected boolean getNullability() diff --git a/presto-parquet/src/test/java/com/facebook/presto/parquet/reader/TestColumnIndexBuilder.java b/presto-parquet/src/test/java/com/facebook/presto/parquet/reader/TestColumnIndexBuilder.java new file mode 100644 index 0000000000000..28cc2c80fa6a3 --- /dev/null +++ b/presto-parquet/src/test/java/com/facebook/presto/parquet/reader/TestColumnIndexBuilder.java @@ -0,0 +1,1637 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.parquet.reader; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.filter2.predicate.Operators; +import org.apache.parquet.filter2.predicate.UserDefinedPredicate; +import org.apache.parquet.internal.column.columnindex.BoundaryOrder; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; +import org.junit.Test; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.PrimitiveIterator; + +import static java.util.Arrays.asList; +import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.booleanColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.eq; +import static org.apache.parquet.filter2.predicate.FilterApi.floatColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.gt; +import static org.apache.parquet.filter2.predicate.FilterApi.gtEq; +import static org.apache.parquet.filter2.predicate.FilterApi.intColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.longColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.lt; +import static org.apache.parquet.filter2.predicate.FilterApi.ltEq; +import static org.apache.parquet.filter2.predicate.FilterApi.notEq; +import static org.apache.parquet.filter2.predicate.FilterApi.userDefined; +import static 
org.apache.parquet.filter2.predicate.LogicalInverter.invert; +import static org.apache.parquet.schema.OriginalType.DECIMAL; +import static org.apache.parquet.schema.OriginalType.UINT_8; +import static org.apache.parquet.schema.OriginalType.UTF8; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +// import static org.hamcrest.CoreMatchers.instanceOf; +// import static org.junit.Assert.assertThat; + +public class TestColumnIndexBuilder +{ + public static class BinaryDecimalIsNullOrZeroUdp + extends UserDefinedPredicate + { + private static final Binary ZERO = decimalBinary("0.0"); + + @Override + public boolean keep(Binary value) + { + return value == null || value.equals(ZERO); + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + Comparator cmp = statistics.getComparator(); + return cmp.compare(statistics.getMin(), ZERO) > 0 || cmp.compare(statistics.getMax(), ZERO) < 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + Comparator cmp = statistics.getComparator(); + return cmp.compare(statistics.getMin(), ZERO) == 0 && cmp.compare(statistics.getMax(), ZERO) == 0; + } + } + + public static class BinaryUtf8StartsWithB + extends UserDefinedPredicate + { + 
private static final Binary B = stringBinary("B"); + private static final Binary C = stringBinary("C"); + + @Override + public boolean keep(Binary value) + { + return value != null && value.length() > 0 && value.getBytesUnsafe()[0] == 'B'; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + Comparator cmp = statistics.getComparator(); + return cmp.compare(statistics.getMin(), C) >= 0 || cmp.compare(statistics.getMax(), B) < 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + Comparator cmp = statistics.getComparator(); + return cmp.compare(statistics.getMin(), B) >= 0 && cmp.compare(statistics.getMax(), C) < 0; + } + } + + public static class BooleanIsTrueOrNull + extends UserDefinedPredicate + { + @Override + public boolean keep(Boolean value) + { + return value == null || value; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + return statistics.getComparator().compare(statistics.getMax(), true) != 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + return statistics.getComparator().compare(statistics.getMin(), true) == 0; + } + } + + public static class DoubleIsInteger + extends UserDefinedPredicate + { + @Override + public boolean keep(Double value) + { + return value != null && Math.floor(value) == value; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + double min = statistics.getMin(); + double max = statistics.getMax(); + Comparator cmp = statistics.getComparator(); + return cmp.compare(Math.floor(min), Math.floor(max)) == 0 && cmp.compare(Math.floor(min), min) != 0 + && cmp.compare(Math.floor(max), max) != 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + double min = 
statistics.getMin(); + double max = statistics.getMax(); + Comparator cmp = statistics.getComparator(); + return cmp.compare(min, max) == 0 && cmp.compare(Math.floor(min), min) == 0; + } + } + + public static class FloatIsInteger + extends UserDefinedPredicate + { + private static float floor(float value) + { + return (float) Math.floor(value); + } + + @Override + public boolean keep(Float value) + { + return value != null && Math.floor(value) == value; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + float min = statistics.getMin(); + float max = statistics.getMax(); + Comparator cmp = statistics.getComparator(); + return cmp.compare(floor(min), floor(max)) == 0 && cmp.compare(floor(min), min) != 0 + && cmp.compare(floor(max), max) != 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + float min = statistics.getMin(); + float max = statistics.getMax(); + Comparator cmp = statistics.getComparator(); + return cmp.compare(min, max) == 0 && cmp.compare(floor(min), min) == 0; + } + } + + public static class IntegerIsDivisableWith3 + extends UserDefinedPredicate + { + @Override + public boolean keep(Integer value) + { + return value != null && value % 3 == 0; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + int min = statistics.getMin(); + int max = statistics.getMax(); + return min % 3 != 0 && max % 3 != 0 && max - min < 3; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + int min = statistics.getMin(); + int max = statistics.getMax(); + return min == max && min % 3 == 0; + } + } + + public static class LongIsDivisableWith3 + extends UserDefinedPredicate + { + @Override + public boolean keep(Long value) + { + return value != null && value % 3 == 0; + } + + @Override + public boolean 
canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + long min = statistics.getMin(); + long max = statistics.getMax(); + return min % 3 != 0 && max % 3 != 0 && max - min < 3; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + long min = statistics.getMin(); + long max = statistics.getMax(); + return min == max && min % 3 == 0; + } + } + + @Test + public void testBuildBinaryDecimal() + { + PrimitiveType type = Types.required(BINARY).as(DECIMAL).precision(12).scale(2).named("test_binary_decimal"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + //assertThat(builder, instanceOf(BinaryColumnIndexBuilder.class)); + assertNull(builder.build()); + Operators.BinaryColumn col = binaryColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("-0.17"), decimalBinary("1234567890.12"))); + builder.add(sb.stats(type, decimalBinary("-234.23"), null, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, decimalBinary("-9999293.23"), decimalBinary("2348978.45"))); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("87656273"))); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 0, 3, 3, 0, 4, 2, 0); + assertCorrectNullPages(columnIndex, true, false, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + decimalBinary("1234567890.12"), + decimalBinary("-234.23"), + null, + decimalBinary("2348978.45"), + null, + null, + decimalBinary("87656273")); + 
assertCorrectValues(columnIndex.getMinValues(), + null, + decimalBinary("-0.17"), + decimalBinary("-234.23"), + null, + decimalBinary("-9999293.23"), + null, + null, + decimalBinary("87656273")); + assertCorrectFiltering(columnIndex, eq(col, decimalBinary("0.0")), 1, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, decimalBinary("87656273")), 0, 1, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 4, 7); + assertCorrectFiltering(columnIndex, gt(col, decimalBinary("2348978.45")), 1); + assertCorrectFiltering(columnIndex, gtEq(col, decimalBinary("2348978.45")), 1, 4); + assertCorrectFiltering(columnIndex, lt(col, decimalBinary("-234.23")), 4); + assertCorrectFiltering(columnIndex, ltEq(col, decimalBinary("-234.23")), 2, 4); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryDecimalIsNullOrZeroUdp.class), 0, 1, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryDecimalIsNullOrZeroUdp.class)), 1, 2, 4, 7); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, decimalBinary("-9999293.23"), decimalBinary("-234.23"))); + builder.add(sb.stats(type, decimalBinary("-0.17"), decimalBinary("87656273"))); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("87656273"))); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("1234567890.12"), null, null, null)); + builder.add(sb.stats(type, null, null, null)); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 4, 0, 0, 2, 0, 2, 3, 3); + assertCorrectNullPages(columnIndex, true, false, false, true, false, 
true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), + null, + decimalBinary("-234.23"), + decimalBinary("87656273"), + null, + decimalBinary("87656273"), + null, + decimalBinary("1234567890.12"), + null); + assertCorrectValues(columnIndex.getMinValues(), + null, + decimalBinary("-9999293.23"), + decimalBinary("-0.17"), + null, + decimalBinary("87656273"), + null, + decimalBinary("1234567890.12"), + null); + assertCorrectFiltering(columnIndex, eq(col, decimalBinary("87656273")), 2, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, decimalBinary("87656273")), 0, 1, 2, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 4, 6); + assertCorrectFiltering(columnIndex, gt(col, decimalBinary("87656273")), 6); + assertCorrectFiltering(columnIndex, gtEq(col, decimalBinary("87656273")), 2, 4, 6); + assertCorrectFiltering(columnIndex, lt(col, decimalBinary("-0.17")), 1); + assertCorrectFiltering(columnIndex, ltEq(col, decimalBinary("-0.17")), 1, 2); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryDecimalIsNullOrZeroUdp.class), 0, 2, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryDecimalIsNullOrZeroUdp.class)), 1, 2, 4, 6); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("1234567890.12"), null, null, null)); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, decimalBinary("1234567890.12"), decimalBinary("87656273"))); + builder.add(sb.stats(type, decimalBinary("987656273"), decimalBinary("-0.17"))); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("-234.23"), decimalBinary("-9999293.23"))); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), 
builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 3, 2, 3, 4, 0, 0, 2, 0); + assertCorrectNullPages(columnIndex, true, true, false, true, false, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + null, + decimalBinary("1234567890.12"), + null, + decimalBinary("1234567890.12"), + decimalBinary("987656273"), + null, + decimalBinary("-234.23")); + assertCorrectValues(columnIndex.getMinValues(), + null, + null, + decimalBinary("1234567890.12"), + null, + decimalBinary("87656273"), + decimalBinary("-0.17"), + null, + decimalBinary("-9999293.23")); + assertCorrectFiltering(columnIndex, eq(col, decimalBinary("1234567890.12")), 2, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 6); + assertCorrectFiltering(columnIndex, notEq(col, decimalBinary("0.0")), 0, 1, 2, 3, 4, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 2, 4, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, decimalBinary("1234567890.12"))); + assertCorrectFiltering(columnIndex, gtEq(col, decimalBinary("1234567890.12")), 2, 4); + assertCorrectFiltering(columnIndex, lt(col, decimalBinary("-0.17")), 7); + assertCorrectFiltering(columnIndex, ltEq(col, decimalBinary("-0.17")), 5, 7); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryDecimalIsNullOrZeroUdp.class), 0, 1, 2, 3, 5, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryDecimalIsNullOrZeroUdp.class)), 2, 4, 5, 7); + } + + @Test + public void testBuildBinaryUtf8() + { + PrimitiveType type = Types.required(BINARY).as(UTF8).named("test_binary_utf8"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + //assertThat(builder, instanceOf(BinaryColumnIndexBuilder.class)); + assertNull(builder.build()); + Operators.BinaryColumn col = binaryColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + 
builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Jeltz"), stringBinary("Slartibartfast"), null, null)); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Beeblebrox"), stringBinary("Prefect"))); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Trilian"), null)); + builder.add(sb.stats(type, stringBinary("Beeblebrox"))); + builder.add(sb.stats(type, null, null)); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 5, 2, 0, 1, 0, 2); + assertCorrectNullPages(columnIndex, true, false, true, true, false, false, false, true); + assertCorrectValues(columnIndex.getMaxValues(), + null, + stringBinary("Slartibartfast"), + null, + null, + stringBinary("Prefect"), + stringBinary("Trilian"), + stringBinary("Beeblebrox"), + null); + assertCorrectValues(columnIndex.getMinValues(), + null, + stringBinary("Jeltz"), + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + stringBinary("Beeblebrox"), + null); + assertCorrectFiltering(columnIndex, eq(col, stringBinary("Marvin")), 1, 4, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 5, 7); + assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Beeblebrox")), 0, 1, 2, 3, 4, 5, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 4, 5, 6); + assertCorrectFiltering(columnIndex, gt(col, stringBinary("Prefect")), 1, 5); + assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Prefect")), 1, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, stringBinary("Dent")), 4, 6); + assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Dent")), 4, 5, 6); + assertCorrectFiltering(columnIndex, userDefined(col, 
BinaryUtf8StartsWithB.class), 4, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 7); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, stringBinary("Beeblebrox"), stringBinary("Dent"), null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Jeltz"))); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Prefect"), null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Slartibartfast"))); + builder.add(sb.stats(type, null, null)); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 5, 0, 1, 2, 0, 2); + assertCorrectNullPages(columnIndex, false, true, true, false, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), + stringBinary("Dent"), + null, + null, + stringBinary("Jeltz"), + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast"), + null); + assertCorrectValues(columnIndex.getMinValues(), + stringBinary("Beeblebrox"), + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Slartibartfast"), + null); + assertCorrectFiltering(columnIndex, eq(col, stringBinary("Jeltz")), 3, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 4, 5, 7); + assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Slartibartfast")), 0, 1, 2, 3, 4, 5, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 3, 4, 6); + assertCorrectFiltering(columnIndex, gt(col, stringBinary("Marvin")), 4, 6); + assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Marvin")), 4, 6); + 
assertCorrectFiltering(columnIndex, lt(col, stringBinary("Dent")), 0); + assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Dent")), 0, 3, 4); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 0); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 6, 7); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Slartibartfast"))); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, stringBinary("Prefect"), stringBinary("Jeltz"), null)); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Dent"))); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Beeblebrox"), null, null)); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 0, 5, 1, 0, 2, 2, 2); + assertCorrectNullPages(columnIndex, true, false, true, false, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + stringBinary("Slartibartfast"), + null, + stringBinary("Prefect"), + stringBinary("Dent"), + null, + null, + stringBinary("Dent")); + assertCorrectValues(columnIndex.getMinValues(), + null, + stringBinary("Slartibartfast"), + null, + stringBinary("Jeltz"), + stringBinary("Dent"), + null, + null, + stringBinary("Beeblebrox")); + assertCorrectFiltering(columnIndex, eq(col, stringBinary("Marvin")), 3); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Dent")), 0, 1, 2, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, 
notEq(col, null), 1, 3, 4, 7); + assertCorrectFiltering(columnIndex, gt(col, stringBinary("Prefect")), 1); + assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Prefect")), 1, 3); + assertCorrectFiltering(columnIndex, lt(col, stringBinary("Marvin")), 3, 4, 7); + assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Marvin")), 3, 4, 7); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 6, 7); + } + + @Test + public void testStaticBuildBinary() + { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(BINARY).as(UTF8).named("test_binary_utf8"), + BoundaryOrder.ASCENDING, + asList(true, true, false, false, true, false, true, false), + asList(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), + toBBList( + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + null, + stringBinary("Jeltz"), + null, + stringBinary("Slartibartfast")), + toBBList( + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast"))); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectNullPages(columnIndex, true, true, false, false, true, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast")); + assertCorrectValues(columnIndex.getMinValues(), + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + null, + stringBinary("Jeltz"), + null, + stringBinary("Slartibartfast")); + } + + @Test + public void testFilterWithoutNullCounts() + { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(BINARY).as(UTF8).named("test_binary_utf8"), + BoundaryOrder.ASCENDING, + 
asList(true, true, false, false, true, false, true, false), + null, + toBBList( + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + null, + stringBinary("Jeltz"), + null, + stringBinary("Slartibartfast")), + toBBList( + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast"))); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertNull(columnIndex.getNullCounts()); + assertCorrectNullPages(columnIndex, true, true, false, false, true, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast")); + assertCorrectValues(columnIndex.getMinValues(), + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + null, + stringBinary("Jeltz"), + null, + stringBinary("Slartibartfast")); + + Operators.BinaryColumn col = binaryColumn("test_col"); + assertCorrectFiltering(columnIndex, eq(col, stringBinary("Dent")), 2, 3); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Dent")), 0, 1, 2, 3, 4, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 2, 3, 5, 7); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryDecimalIsNullOrZeroUdp.class), 0, 1, 2, 3, 4, 5, 6, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryDecimalIsNullOrZeroUdp.class)), 2, 3, 5, 7); + } + + @Test + public void testBuildBoolean() + { + PrimitiveType type = Types.required(BOOLEAN).named("test_boolean"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + //assertThat(builder, instanceOf(BooleanColumnIndexBuilder.class)); + assertNull(builder.build()); + Operators.BooleanColumn col = booleanColumn("test_col"); + + builder = 
ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, false, true)); + builder.add(sb.stats(type, true, false, null)); + builder.add(sb.stats(type, true, true, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, false, false)); + assertEquals(5, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), true, true, true, null, false); + assertCorrectValues(columnIndex.getMinValues(), false, false, true, null, false); + assertCorrectFiltering(columnIndex, eq(col, true), 0, 1, 2); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, true), 0, 1, 2, 3, 4); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4); + assertCorrectFiltering(columnIndex, userDefined(col, BooleanIsTrueOrNull.class), 0, 1, 2, 3); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BooleanIsTrueOrNull.class)), 0, 1, 4); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, false, false)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, false, true, null)); + builder.add(sb.stats(type, false, true, null, null)); + builder.add(sb.stats(type, null, null, null)); + assertEquals(7, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + 
assertCorrectNullCounts(columnIndex, 2, 0, 3, 4, 1, 2, 3); + assertCorrectNullPages(columnIndex, true, false, true, true, false, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, false, null, null, true, true, null); + assertCorrectValues(columnIndex.getMinValues(), null, false, null, null, false, false, null); + assertCorrectFiltering(columnIndex, eq(col, true), 4, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, true), 0, 1, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, BooleanIsTrueOrNull.class), 0, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BooleanIsTrueOrNull.class)), 1, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, true, true)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, true, false, null)); + builder.add(sb.stats(type, false, false, null, null)); + builder.add(sb.stats(type, null, null, null)); + assertEquals(7, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 0, 3, 4, 1, 2, 3); + assertCorrectNullPages(columnIndex, true, false, true, true, false, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, true, null, null, true, false, null); + assertCorrectValues(columnIndex.getMinValues(), null, true, null, null, false, false, null); + assertCorrectFiltering(columnIndex, eq(col, true), 1, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, true), 0, 2, 3, 4, 
5, 6); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, BooleanIsTrueOrNull.class), 0, 1, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BooleanIsTrueOrNull.class)), 4, 5); + } + + @Test + public void testStaticBuildBoolean() + { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(BOOLEAN).named("test_boolean"), + BoundaryOrder.DESCENDING, + asList(false, true, false, true, false, true), + asList(9L, 8L, 7L, 6L, 5L, 0L), + toBBList(false, null, false, null, true, null), + toBBList(true, null, false, null, true, null)); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 0); + assertCorrectNullPages(columnIndex, false, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), true, null, false, null, true, null); + assertCorrectValues(columnIndex.getMinValues(), false, null, false, null, true, null); + } + + @Test + public void testBuildDouble() + { + PrimitiveType type = Types.required(DOUBLE).named("test_double"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + //assertThat(builder, instanceOf(DoubleColumnIndexBuilder.class)); + assertNull(builder.build()); + Operators.DoubleColumn col = doubleColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -4.2, -4.1)); + builder.add(sb.stats(type, -11.7, 7.0, null)); + builder.add(sb.stats(type, 2.2, 2.2, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1.9, 2.32)); + builder.add(sb.stats(type, -21.0, 8.1)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0); + 
assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), -4.1, 7.0, 2.2, null, 2.32, 8.1); + assertCorrectValues(columnIndex.getMinValues(), -4.2, -11.7, 2.2, null, 1.9, -21.0); + assertCorrectFiltering(columnIndex, eq(col, 0.0), 1, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 2.2), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2.2), 1, 4, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2.2), 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, -4.2), 1, 5); + assertCorrectFiltering(columnIndex, ltEq(col, -4.2), 0, 1, 5); + assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 4, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -532.3, -345.2, null, null)); + builder.add(sb.stats(type, -234.7, -234.6, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, -234.6, 2.99999)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 3.0, 42.83)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, -345.2, -234.6, null, null, 2.99999, null, 42.83, null); + 
assertCorrectValues(columnIndex.getMinValues(), null, -532.3, -234.7, null, null, -234.6, null, 3.0, null); + assertCorrectFiltering(columnIndex, eq(col, 0.0), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, 0.0), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 2.99999), 7); + assertCorrectFiltering(columnIndex, gtEq(col, 2.99999), 5, 7); + assertCorrectFiltering(columnIndex, lt(col, -234.6), 1, 2); + assertCorrectFiltering(columnIndex, ltEq(col, -234.6), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 532.3, 345.2)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 234.7, 234.6, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 234.69, -2.99999)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3.0, -42.83)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 532.3, null, 234.7, null, 234.69, null, null, -3.0); + assertCorrectValues(columnIndex.getMinValues(), null, 345.2, null, 234.6, null, -2.99999, null, null, -42.83); + 
assertCorrectFiltering(columnIndex, eq(col, 234.6), 3, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 2.2), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 2.2), 1, 3, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 234.69), 1, 3, 5); + assertCorrectFiltering(columnIndex, lt(col, -2.99999), 8); + assertCorrectFiltering(columnIndex, ltEq(col, -2.99999), 5, 8); + assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8); + } + + @Test + public void testBuildDoubleZeroNaN() + { + PrimitiveType type = Types.required(DOUBLE).named("test_double"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -1.0, -0.0)); + builder.add(sb.stats(type, 0.0, 1.0)); + builder.add(sb.stats(type, 1.0, 100.0)); + ColumnIndex columnIndex = builder.build(); + assertCorrectValues(columnIndex.getMinValues(), -1.0, -0.0, 1.0); + assertCorrectValues(columnIndex.getMaxValues(), 0.0, 1.0, 100.0); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + builder.add(sb.stats(type, -1.0, -0.0)); + builder.add(sb.stats(type, 0.0, Double.NaN)); + builder.add(sb.stats(type, 1.0, 100.0)); + assertNull(builder.build()); + } + + @Test + public void testStaticBuildDouble() + { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(DOUBLE).named("test_double"), + BoundaryOrder.UNORDERED, + asList(false, false, false, false, false, false), + asList(0L, 1L, 2L, 3L, 4L, 5L), + toBBList(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0), + toBBList(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + 
assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 4, 5); + assertCorrectNullPages(columnIndex, false, false, false, false, false, false); + assertCorrectValues(columnIndex.getMaxValues(), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0); + assertCorrectValues(columnIndex.getMinValues(), -1.0, -2.0, -3.0, -4.0, -5.0, -6.0); + } + + @Test + public void testBuildFloat() + { + PrimitiveType type = Types.required(FLOAT).named("test_float"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + //assertThat(builder, instanceOf(FloatColumnIndexBuilder.class)); + assertNull(builder.build()); + Operators.FloatColumn col = floatColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -4.2f, -4.1f)); + builder.add(sb.stats(type, -11.7f, 7.0f, null)); + builder.add(sb.stats(type, 2.2f, 2.2f, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1.9f, 2.32f)); + builder.add(sb.stats(type, -21.0f, 8.1f)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), -4.1f, 7.0f, 2.2f, null, 2.32f, 8.1f); + assertCorrectValues(columnIndex.getMinValues(), -4.2f, -11.7f, 2.2f, null, 1.9f, -21.0f); + assertCorrectFiltering(columnIndex, eq(col, 0.0f), 1, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2.2f), 1, 4, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2.2f), 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, 0.0f), 0, 1, 5); + 
assertCorrectFiltering(columnIndex, ltEq(col, 1.9f), 0, 1, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, FloatIsInteger.class), 1, 4, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -532.3f, -345.2f, null, null)); + builder.add(sb.stats(type, -300.6f, -234.7f, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, -234.6f, 2.99999f)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 3.0f, 42.83f)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, -345.2f, -234.7f, null, null, 2.99999f, null, 42.83f, null); + assertCorrectValues(columnIndex.getMinValues(), null, -532.3f, -300.6f, null, null, -234.6f, null, 3.0f, null); + assertCorrectFiltering(columnIndex, eq(col, 0.0f), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 2.2f), 5, 7); + assertCorrectFiltering(columnIndex, gtEq(col, -234.7f), 2, 5, 7); + assertCorrectFiltering(columnIndex, lt(col, -234.6f), 1, 2); + assertCorrectFiltering(columnIndex, ltEq(col, -234.6f), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, 
FloatIsInteger.class), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 532.3f, 345.2f)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 234.7f, 234.6f, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 234.6f, -2.99999f)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3.0f, -42.83f)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 532.3f, null, 234.7f, null, 234.6f, null, null, -3.0f); + assertCorrectValues(columnIndex.getMinValues(), null, 345.2f, null, 234.6f, null, -2.99999f, null, null, -42.83f); + assertCorrectFiltering(columnIndex, eq(col, 234.65f), 3); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 2.2f), 1, 3, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2.2f), 1, 3, 5); + assertCorrectFiltering(columnIndex, lt(col, 0.0f), 5, 8); + assertCorrectFiltering(columnIndex, ltEq(col, 0.0f), 5, 8); + assertCorrectFiltering(columnIndex, userDefined(col, FloatIsInteger.class), 1, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 
1, 2, 3, 4, 5, 6, 7, 8); + } + + @Test + public void testBuildFloatZeroNaN() + { + PrimitiveType type = Types.required(FLOAT).named("test_float"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -1.0f, -0.0f)); + builder.add(sb.stats(type, 0.0f, 1.0f)); + builder.add(sb.stats(type, 1.0f, 100.0f)); + ColumnIndex columnIndex = builder.build(); + assertCorrectValues(columnIndex.getMinValues(), -1.0f, -0.0f, 1.0f); + assertCorrectValues(columnIndex.getMaxValues(), 0.0f, 1.0f, 100.0f); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + builder.add(sb.stats(type, -1.0f, -0.0f)); + builder.add(sb.stats(type, 0.0f, Float.NaN)); + builder.add(sb.stats(type, 1.0f, 100.0f)); + assertNull(builder.build()); + } + + @Test + public void testStaticBuildFloat() + { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(FLOAT).named("test_float"), + BoundaryOrder.ASCENDING, + asList(true, true, true, false, false, false), + asList(9L, 8L, 7L, 6L, 0L, 0L), + toBBList(null, null, null, -3.0f, -2.0f, 0.1f), + toBBList(null, null, null, -2.0f, 0.0f, 6.0f)); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 0, 0); + assertCorrectNullPages(columnIndex, true, true, true, false, false, false); + assertCorrectValues(columnIndex.getMaxValues(), null, null, null, -2.0f, 0.0f, 6.0f); + assertCorrectValues(columnIndex.getMinValues(), null, null, null, -3.0f, -2.0f, 0.1f); + } + + @Test + public void testBuildInt32() + { + PrimitiveType type = Types.required(INT32).named("test_int32"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + //assertThat(builder, instanceOf(IntColumnIndexBuilder.class)); + assertNull(builder.build()); + Operators.IntColumn col = intColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + 
builder.add(sb.stats(type, -4, 10)); + builder.add(sb.stats(type, -11, 7, null)); + builder.add(sb.stats(type, 2, 2, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1, 2)); + builder.add(sb.stats(type, -21, 8)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), 10, 7, 2, null, 2, 8); + assertCorrectValues(columnIndex.getMinValues(), -4, -11, 2, null, 1, -21); + assertCorrectFiltering(columnIndex, eq(col, 2), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2), 0, 1, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, 2), 0, 1, 4, 5); + assertCorrectFiltering(columnIndex, ltEq(col, 2), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 0, 1, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -532, -345, null, null)); + builder.add(sb.stats(type, -500, -42, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, -42, 2)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 3, 42)); + builder.add(sb.stats(type, null, 
null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, -345, -42, null, null, 2, null, 42, null); + assertCorrectValues(columnIndex.getMinValues(), null, -532, -500, null, null, -42, null, 3, null); + assertCorrectFiltering(columnIndex, eq(col, 2), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 2), 7); + assertCorrectFiltering(columnIndex, gtEq(col, 2), 5, 7); + assertCorrectFiltering(columnIndex, lt(col, 2), 1, 2, 5); + assertCorrectFiltering(columnIndex, ltEq(col, 2), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 532, 345)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 234, 42, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 42, -2)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3, -42)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + 
assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 532, null, 234, null, 42, null, null, -3); + assertCorrectValues(columnIndex.getMinValues(), null, 345, null, 42, null, -2, null, null, -42); + assertCorrectFiltering(columnIndex, eq(col, 2), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 2), 1, 3, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2), 1, 3, 5); + assertCorrectFiltering(columnIndex, lt(col, 2), 5, 8); + assertCorrectFiltering(columnIndex, ltEq(col, 2), 5, 8); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + } + + @Test + public void testStaticBuildInt32() + { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(INT32).named("test_int32"), + BoundaryOrder.DESCENDING, + asList(false, false, false, true, true, true), + asList(0L, 10L, 0L, 3L, 5L, 7L), + toBBList(10, 8, 6, null, null, null), + toBBList(9, 7, 5, null, null, null)); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 10, 0, 3, 5, 7); + assertCorrectNullPages(columnIndex, false, false, false, true, true, true); + assertCorrectValues(columnIndex.getMaxValues(), 9, 7, 5, null, null, null); + assertCorrectValues(columnIndex.getMinValues(), 10, 8, 6, null, null, null); + } + + @Test + public void testBuildUInt8() + { + PrimitiveType type = 
Types.required(INT32).as(UINT_8).named("test_uint8"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + //assertThat(builder, instanceOf(IntColumnIndexBuilder.class)); + assertNull(builder.build()); + Operators.IntColumn col = intColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, 4, 10)); + builder.add(sb.stats(type, 11, 17, null)); + builder.add(sb.stats(type, 2, 2, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1, 0xFF)); + builder.add(sb.stats(type, 0xEF, 0xFA)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), 10, 17, 2, null, 0xFF, 0xFA); + assertCorrectValues(columnIndex.getMinValues(), 4, 11, 2, null, 1, 0xEF); + assertCorrectFiltering(columnIndex, eq(col, 2), 2, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2), 0, 1, 4, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, 0xEF), 0, 1, 2, 4); + assertCorrectFiltering(columnIndex, ltEq(col, 0xEF), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 0, 1, 4, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + 
builder.add(sb.stats(type, 0, 0, null, null)); + builder.add(sb.stats(type, 0, 42, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 42, 0xEE)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 0xEF, 0xFF)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, 0, 42, null, null, 0xEE, null, 0xFF, null); + assertCorrectValues(columnIndex.getMinValues(), null, 0, 0, null, null, 42, null, 0xEF, null); + assertCorrectFiltering(columnIndex, eq(col, 2), 2); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 0xEE), 7); + assertCorrectFiltering(columnIndex, gtEq(col, 0xEE), 5, 7); + assertCorrectFiltering(columnIndex, lt(col, 42), 1, 2); + assertCorrectFiltering(columnIndex, ltEq(col, 42), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 0xFF, 0xFF)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 0xEF, 0xEA, null)); + builder.add(sb.stats(type, 
null, null)); + builder.add(sb.stats(type, 0xEE, 42)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 41, 0)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 0xFF, null, 0xEF, null, 0xEE, null, null, 41); + assertCorrectValues(columnIndex.getMinValues(), null, 0xFF, null, 0xEA, null, 42, null, null, 0); + assertCorrectFiltering(columnIndex, eq(col, 0xAB), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 0xFF), 0, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 0xFF)); + assertCorrectFiltering(columnIndex, gtEq(col, 0xFF), 1); + assertCorrectFiltering(columnIndex, lt(col, 42), 8); + assertCorrectFiltering(columnIndex, ltEq(col, 42), 5, 8); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 2, 3, 4, 5, 6, 7, + 8); + } + + @Test + public void testBuildInt64() + { + PrimitiveType type = Types.required(INT64).named("test_int64"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + //assertThat(builder, instanceOf(LongColumnIndexBuilder.class)); + assertNull(builder.build()); + Operators.LongColumn col = longColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -4L, 10L)); + builder.add(sb.stats(type, -11L, 7L, null)); + builder.add(sb.stats(type, 
2L, 2L, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1L, 2L)); + builder.add(sb.stats(type, -21L, 8L)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0L, 1L, 2L, 3L, 0L, 0L); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), 10L, 7L, 2L, null, 2L, 8L); + assertCorrectValues(columnIndex.getMinValues(), -4L, -11L, 2L, null, 1L, -21L); + assertCorrectFiltering(columnIndex, eq(col, 0L), 0, 1, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 0L), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2L), 0, 1, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2L), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, -21L)); + assertCorrectFiltering(columnIndex, ltEq(col, -21L), 5); + assertCorrectFiltering(columnIndex, userDefined(col, LongIsDivisableWith3.class), 0, 1, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, LongIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -532L, -345L, null, null)); + builder.add(sb.stats(type, -234L, -42L, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, -42L, 2L)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3L, 42L)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), 
builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, -345L, -42L, null, null, 2L, null, 42L, null); + assertCorrectValues(columnIndex.getMinValues(), null, -532L, -234L, null, null, -42L, null, -3L, null); + assertCorrectFiltering(columnIndex, eq(col, -42L), 2, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, -42L), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 2L), 7); + assertCorrectFiltering(columnIndex, gtEq(col, 2L), 5, 7); + assertCorrectFiltering(columnIndex, lt(col, -42L), 1, 2); + assertCorrectFiltering(columnIndex, ltEq(col, -42L), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, LongIsDivisableWith3.class), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, LongIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 532L, 345L)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 234L, 42L, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 42L, -2L)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3L, -42L)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + 
assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 532L, null, 234L, null, 42L, null, null, -3L); + assertCorrectValues(columnIndex.getMinValues(), null, 345L, null, 42L, null, -2L, null, null, -42L); + assertCorrectFiltering(columnIndex, eq(col, 0L), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 0L), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 2L), 1, 3, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2L), 1, 3, 5); + assertCorrectFiltering(columnIndex, lt(col, -42L)); + assertCorrectFiltering(columnIndex, ltEq(col, -42L), 8); + assertCorrectFiltering(columnIndex, userDefined(col, LongIsDivisableWith3.class), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, LongIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + } + + @Test + public void testStaticBuildInt64() + { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(INT64).named("test_int64"), + BoundaryOrder.UNORDERED, + asList(true, false, true, false, true, false), + asList(1L, 2L, 3L, 4L, 5L, 6L), + toBBList(null, 2L, null, 4L, null, 9L), + toBBList(null, 3L, null, 15L, null, 10L)); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 1, 2, 3, 4, 5, 6); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 3L, null, 15L, null, 10L); + assertCorrectValues(columnIndex.getMinValues(), null, 2L, null, 4L, null, 9L); + } + + @Test + public void testNoOpBuilder() + { + ColumnIndexBuilder builder = ColumnIndexBuilder.getNoOpBuilder(); + StatsBuilder sb = new StatsBuilder(); + 
builder.add(sb.stats(Types.required(BINARY).as(UTF8).named("test_binary_utf8"), stringBinary("Jeltz"), + stringBinary("Slartibartfast"), null, null)); + builder.add(sb.stats(Types.required(BOOLEAN).named("test_boolean"), true, true, null, null)); + builder.add(sb.stats(Types.required(DOUBLE).named("test_double"), null, null, null)); + builder.add(sb.stats(Types.required(INT32).named("test_int32"), null, null)); + builder.add(sb.stats(Types.required(INT64).named("test_int64"), -234L, -42L, null)); + assertEquals(0, builder.getPageCount()); + assertEquals(0, builder.getMinMaxSize()); + assertNull(builder.build()); + } + + private static List toBBList(Binary... values) + { + List buffers = new ArrayList<>(values.length); + for (Binary value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } + else { + buffers.add(value.toByteBuffer()); + } + } + return buffers; + } + + private static List toBBList(Boolean... values) + { + List buffers = new ArrayList<>(values.length); + for (Boolean value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } + else { + buffers.add(ByteBuffer.wrap(BytesUtils.booleanToBytes(value))); + } + } + return buffers; + } + + private static List toBBList(Double... values) + { + List buffers = new ArrayList<>(values.length); + for (Double value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } + else { + buffers.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(value)))); + } + } + return buffers; + } + + private static List toBBList(Float... values) + { + List buffers = new ArrayList<>(values.length); + for (Float value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } + else { + buffers.add(ByteBuffer.wrap(BytesUtils.intToBytes(Float.floatToIntBits(value)))); + } + } + return buffers; + } + + private static List toBBList(Integer... 
values) + { + List buffers = new ArrayList<>(values.length); + for (Integer value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } + else { + buffers.add(ByteBuffer.wrap(BytesUtils.intToBytes(value))); + } + } + return buffers; + } + + private static List toBBList(Long... values) + { + List buffers = new ArrayList<>(values.length); + for (Long value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } + else { + buffers.add(ByteBuffer.wrap(BytesUtils.longToBytes(value))); + } + } + return buffers; + } + + private static Binary decimalBinary(String num) + { + return Binary.fromConstantByteArray(new BigDecimal(num).unscaledValue().toByteArray()); + } + + private static Binary stringBinary(String str) + { + return Binary.fromString(str); + } + + private static void assertCorrectValues(List values, Binary... expectedValues) + { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Binary expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } + else { + assertArrayEquals("Invalid value for page " + i, expectedValue.getBytesUnsafe(), value.array()); + } + } + } + + private static void assertCorrectValues(List values, Boolean... expectedValues) + { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Boolean expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } + else { + assertEquals("The byte buffer should be 1 byte long for boolean", 1, value.remaining()); + assertEquals("Invalid value for page " + i, expectedValue.booleanValue(), value.get(0) != 0); + } + } + } + + private static void assertCorrectValues(List values, Double... 
expectedValues) + { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Double expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } + else { + assertEquals("The byte buffer should be 8 bytes long for double", 8, value.remaining()); + assertTrue("Invalid value for page " + i, Double.compare(expectedValue.doubleValue(), value.getDouble(0)) == 0); + } + } + } + + private static void assertCorrectValues(List values, Float... expectedValues) + { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Float expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } + else { + assertEquals("The byte buffer should be 4 bytes long for float", 4, value.remaining()); + assertTrue("Invalid value for page " + i, Float.compare(expectedValue.floatValue(), value.getFloat(0)) == 0); + } + } + } + + private static void assertCorrectValues(List values, Integer... expectedValues) + { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Integer expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } + else { + assertEquals("The byte buffer should be 4 bytes long for int32", 4, value.remaining()); + assertEquals("Invalid value for page " + i, expectedValue.intValue(), value.getInt(0)); + } + } + } + + private static void assertCorrectValues(List values, Long... 
expectedValues) + { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Long expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } + else { + assertEquals("The byte buffer should be 8 bytes long for int64", 8, value.remaining()); + assertEquals("Invalid value for page " + i, expectedValue.longValue(), value.getLong(0)); + } + } + } + + private static void assertCorrectNullCounts(ColumnIndex columnIndex, long... expectedNullCounts) + { + List nullCounts = columnIndex.getNullCounts(); + assertEquals(expectedNullCounts.length, nullCounts.size()); + for (int i = 0; i < expectedNullCounts.length; ++i) { + assertEquals("Invalid null count at page " + i, expectedNullCounts[i], nullCounts.get(i).longValue()); + } + } + + private static void assertCorrectNullPages(ColumnIndex columnIndex, boolean... expectedNullPages) + { + List nullPages = columnIndex.getNullPages(); + assertEquals(expectedNullPages.length, nullPages.size()); + for (int i = 0; i < expectedNullPages.length; ++i) { + assertEquals("Invalid null pages at page " + i, expectedNullPages[i], nullPages.get(i).booleanValue()); + } + } + + private static class StatsBuilder + { + private long minMaxSize; + + Statistics stats(PrimitiveType type, Object... 
values) + { + Statistics stats = Statistics.createStats(type); + for (Object value : values) { + if (value == null) { + stats.incrementNumNulls(); + continue; + } + switch (type.getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + case INT96: + stats.updateStats((Binary) value); + break; + case BOOLEAN: + stats.updateStats((boolean) value); + break; + case DOUBLE: + stats.updateStats((double) value); + break; + case FLOAT: + stats.updateStats((float) value); + break; + case INT32: + stats.updateStats((int) value); + break; + case INT64: + stats.updateStats((long) value); + break; + default: + fail("Unsupported value type for stats: " + value.getClass()); + } + } + if (stats.hasNonNullValue()) { + minMaxSize += stats.getMinBytes().length; + minMaxSize += stats.getMaxBytes().length; + } + return stats; + } + + long getMinMaxSize() + { + return minMaxSize; + } + } + + private static void assertCorrectFiltering(ColumnIndex ci, FilterPredicate predicate, int... expectedIndexes) + { + checkEquals(predicate.accept(ci), expectedIndexes); + } + + static void checkEquals(PrimitiveIterator.OfInt actualIt, int... 
expectedValues) + { + IntList actualList = new IntArrayList(); + actualIt.forEachRemaining((int value) -> actualList.add(value)); + int[] actualValues = actualList.toIntArray(); + assertArrayEquals( + "ExpectedValues: " + Arrays.toString(expectedValues) + " ActualValues: " + Arrays.toString(actualValues), + expectedValues, actualValues); + } +} diff --git a/presto-parquet/src/test/java/com/facebook/presto/parquet/reader/TestColumnIndexFilter.java b/presto-parquet/src/test/java/com/facebook/presto/parquet/reader/TestColumnIndexFilter.java new file mode 100644 index 0000000000000..784d93b436c5e --- /dev/null +++ b/presto-parquet/src/test/java/com/facebook/presto/parquet/reader/TestColumnIndexFilter.java @@ -0,0 +1,510 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.parquet.reader; + +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.predicate.Statistics; +import org.apache.parquet.filter2.predicate.UserDefinedPredicate; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.BoundaryOrder; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; +import org.apache.parquet.schema.PrimitiveType; +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.LongStream; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.parquet.filter2.predicate.FilterApi.and; +import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.booleanColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.eq; +import static org.apache.parquet.filter2.predicate.FilterApi.gt; +import static org.apache.parquet.filter2.predicate.FilterApi.gtEq; +import static org.apache.parquet.filter2.predicate.FilterApi.intColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.longColumn; +import static 
org.apache.parquet.filter2.predicate.FilterApi.lt; +import static org.apache.parquet.filter2.predicate.FilterApi.ltEq; +import static org.apache.parquet.filter2.predicate.FilterApi.notEq; +import static org.apache.parquet.filter2.predicate.FilterApi.or; +import static org.apache.parquet.filter2.predicate.FilterApi.userDefined; +import static org.apache.parquet.filter2.predicate.LogicalInverter.invert; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED; +import static org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.calculateRowRanges; +import static org.apache.parquet.io.api.Binary.fromString; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.Types.optional; +import static org.junit.Assert.assertArrayEquals; + +/** + * Unit tests of {@link ColumnIndexFilter} + */ +public class TestColumnIndexFilter +{ + private static class CIBuilder + { + private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]); + private final PrimitiveType type; + private final BoundaryOrder order; + private List nullPages = new ArrayList<>(); + private List nullCounts = new ArrayList<>(); + private List minValues = new ArrayList<>(); + private List maxValues = new ArrayList<>(); + + CIBuilder(PrimitiveType type, BoundaryOrder order) + { + this.type = type; + this.order = order; + } + + CIBuilder addNullPage(long nullCount) + { + nullPages.add(true); + nullCounts.add(nullCount); + 
minValues.add(EMPTY); + maxValues.add(EMPTY); + return this; + } + + CIBuilder addPage(long nullCount, int min, int max) + { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(min))); + maxValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(max))); + return this; + } + + CIBuilder addPage(long nullCount, String min, String max) + { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(min.getBytes(UTF_8))); + maxValues.add(ByteBuffer.wrap(max.getBytes(UTF_8))); + return this; + } + + CIBuilder addPage(long nullCount, double min, double max) + { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(min)))); + maxValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(max)))); + return this; + } + + ColumnIndex build() + { + return ColumnIndexBuilder.build(type, order, nullPages, nullCounts, minValues, maxValues); + } + } + + private static class OIBuilder + { + private final OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + + OIBuilder addPage(long rowCount) + { + builder.add(1234, rowCount); + return this; + } + + OffsetIndex build() + { + return builder.build(); + } + } + + public static class AnyInt + extends UserDefinedPredicate + { + @Override + public boolean keep(Integer value) + { + return true; + } + + @Override + public boolean canDrop(Statistics statistics) + { + return false; + } + + @Override + public boolean inverseCanDrop(Statistics statistics) + { + return true; + } + } + + /** + *
+     * row     column1        column2        column3        column4        column5
+     *                                                 (no column index)
+     *      ------0------  ------0------  ------0------  ------0------  ------0------
+     * 0.   1              Zulu           2.03                          null
+     *      ------1------  ------1------  ------1------  ------1------  ------1------
+     * 1.   2              Yankee         4.67                          null
+     * 2.   3              Xray           3.42                          null
+     * 3.   4              Whiskey        8.71                          null
+     *                     ------2------                 ------2------
+     * 4.   5              Victor         0.56                          null
+     * 5.   6              Uniform        4.30                          null
+     *                                    ------2------  ------3------
+     * 6.   null           null           null                          null
+     *      ------2------                                ------4------
+     * 7.   7              Tango          3.50                          null
+     *                     ------3------
+     * 8.   7              null           3.14                          null
+     *      ------3------
+     * 9.   7              null           null                          null
+     *                                    ------3------
+     * 10.  null           null           9.99                          null
+     *                     ------4------
+     * 11.  8              Sierra         8.78                          null
+     *                                                   ------5------
+     * 12.  9              Romeo          9.56                          null
+     * 13.  10             Quebec         2.71                          null
+     *      ------4------
+     * 14.  11             Papa           5.71                          null
+     * 15.  12             Oscar          4.09                          null
+     *                     ------5------  ------4------  ------6------
+     * 16.  13             November       null                          null
+     * 17.  14             Mike           null                          null
+     * 18.  15             Lima           0.36                          null
+     * 19.  16             Kilo           2.94                          null
+     * 20.  17             Juliett        4.23                          null
+     *      ------5------  ------6------                 ------7------
+     * 21.  18             India          null                          null
+     * 22.  19             Hotel          5.32                          null
+     *                                    ------5------
+     * 23.  20             Golf           4.17                          null
+     * 24.  21             Foxtrot        7.92                          null
+     * 25.  22             Echo           7.95                          null
+     *                                    ------6------
+     * 26.  23             Delta          null                          null
+     *      ------6------
+     * 27.  24             Charlie        null                          null
+     *                                                   ------8------
+     * 28.  25             Bravo          null                          null
+     *                     ------7------
+     * 29.  26             Alfa           null                          null
+     * 
+ */ + private static final long TOTAL_ROW_COUNT = 30; + private static final ColumnIndex COLUMN1_CI = new CIBuilder(optional(INT32).named("column1"), ASCENDING) + .addPage(0, 1, 1) + .addPage(1, 2, 6) + .addPage(0, 7, 7) + .addPage(1, 7, 10) + .addPage(0, 11, 17) + .addPage(0, 18, 23) + .addPage(0, 24, 26) + .build(); + private static final OffsetIndex COLUMN1_OI = new OIBuilder() + .addPage(1) + .addPage(6) + .addPage(2) + .addPage(5) + .addPage(7) + .addPage(6) + .addPage(3) + .build(); + private static final ColumnIndex COLUMN2_CI = new CIBuilder(optional(BINARY).as(stringType()).named("column2"), DESCENDING) + .addPage(0, "Zulu", "Zulu") + .addPage(0, "Whiskey", "Yankee") + .addPage(1, "Tango", "Victor") + .addNullPage(3) + .addPage(0, "Oscar", "Sierra") + .addPage(0, "Juliett", "November") + .addPage(0, "Bravo", "India") + .addPage(0, "Alfa", "Alfa") + .build(); + private static final OffsetIndex COLUMN2_OI = new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(4) + .addPage(3) + .addPage(5) + .addPage(5) + .addPage(8) + .addPage(1) + .build(); + private static final ColumnIndex COLUMN3_CI = new CIBuilder(optional(DOUBLE).named("column3"), UNORDERED) + .addPage(0, 2.03, 2.03) + .addPage(0, 0.56, 8.71) + .addPage(2, 3.14, 3.50) + .addPage(0, 2.71, 9.99) + .addPage(3, 0.36, 5.32) + .addPage(0, 4.17, 7.95) + .addNullPage(4) + .build(); + private static final OffsetIndex COLUMN3_OI = new OIBuilder() + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(6) + .addPage(7) + .addPage(3) + .addPage(4) + .build(); + private static final ColumnIndex COLUMN4_CI = null; + private static final OffsetIndex COLUMN4_OI = new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(2) + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(5) + .addPage(7) + .addPage(2) + .build(); + private static final ColumnIndex COLUMN5_CI = new CIBuilder(optional(INT64).named("column5"), ASCENDING) + .addNullPage(1) + .addNullPage(29) + .build(); + private static final OffsetIndex COLUMN5_OI = 
new OIBuilder() + .addPage(1) + .addPage(29) + .build(); + private static final ColumnIndexStore STORE = new ColumnIndexStore() + { + @Override + public ColumnIndex getColumnIndex(ColumnPath column) + { + switch (column.toDotString()) { + case "column1": + return COLUMN1_CI; + case "column2": + return COLUMN2_CI; + case "column3": + return COLUMN3_CI; + case "column4": + return COLUMN4_CI; + case "column5": + return COLUMN5_CI; + default: + return null; + } + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) + { + switch (column.toDotString()) { + case "column1": + return COLUMN1_OI; + case "column2": + return COLUMN2_OI; + case "column3": + return COLUMN3_OI; + case "column4": + return COLUMN4_OI; + case "column5": + return COLUMN5_OI; + default: + throw new MissingOffsetIndexException(column); + } + } + }; + + private static Set paths(String... columns) + { + Set paths = new HashSet<>(); + for (String column : columns) { + paths.add(ColumnPath.fromDotString(column)); + } + return paths; + } + + private static void assertAllRows(RowRanges ranges, long rowCount) + { + LongList actualList = new LongArrayList(); + ranges.iterator().forEachRemaining((long value) -> actualList.add(value)); + LongList expectedList = new LongArrayList(); + LongStream.range(0, rowCount).forEach(expectedList::add); + assertArrayEquals(expectedList + " != " + actualList, expectedList.toLongArray(), actualList.toLongArray()); + } + + private static void assertRows(RowRanges ranges, long... 
expectedRows) + { + LongList actualList = new LongArrayList(); + ranges.iterator().forEachRemaining((long value) -> actualList.add(value)); + assertArrayEquals(Arrays.toString(expectedRows) + " != " + actualList, expectedRows, actualList.toLongArray()); + } + + @Test + public void testFiltering() + { + Set paths = paths("column1", "column2", "column3", "column4"); + + assertAllRows( + calculateRowRanges(FilterCompat.get( + userDefined(intColumn("column1"), AnyInt.class)), STORE, paths, TOTAL_ROW_COUNT), + TOTAL_ROW_COUNT); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + eq(intColumn("column1"), null), + eq(binaryColumn("column2"), null)), + and( + eq(doubleColumn("column3"), null), + eq(booleanColumn("column4"), null)))), + STORE, paths, TOTAL_ROW_COUNT), + 6, 9); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + notEq(intColumn("column1"), null), + notEq(binaryColumn("column2"), null)), + and( + notEq(doubleColumn("column3"), null), + notEq(booleanColumn("column4"), null)))), + STORE, paths, TOTAL_ROW_COUNT), + 0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); + assertRows(calculateRowRanges(FilterCompat.get( + or( + and( + lt(intColumn("column1"), 20), + gtEq(binaryColumn("column2"), fromString("Quebec"))), + and( + gt(doubleColumn("column3"), 5.32), + ltEq(binaryColumn("column4"), fromString("XYZ"))))), + STORE, paths, TOTAL_ROW_COUNT), + 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 23, 24, 25); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + gtEq(intColumn("column1"), 7), + gt(binaryColumn("column2"), fromString("India"))), + and( + eq(doubleColumn("column3"), null), + notEq(binaryColumn("column4"), null)))), + STORE, paths, TOTAL_ROW_COUNT), + 7, 16, 17, 18, 19, 20); + assertRows(calculateRowRanges(FilterCompat.get( + and( + or( + invert(userDefined(intColumn("column1"), AnyInt.class)), + eq(binaryColumn("column2"), fromString("Echo"))), + eq(doubleColumn("column3"), 
6.0))), + STORE, paths, TOTAL_ROW_COUNT), + 23, 24, 25); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + gtEq(intColumn("column1"), 7), + lt(intColumn("column1"), 11)), + and( + gt(binaryColumn("column2"), fromString("Romeo")), + ltEq(binaryColumn("column2"), fromString("Tango"))))), + STORE, paths, TOTAL_ROW_COUNT), + 7, 11, 12, 13); + } + + @Test + public void testFilteringOnMissingColumns() + { + Set paths = paths("column1", "column2", "column3", "column4"); + + // Missing column filter is always true + assertAllRows(calculateRowRanges(FilterCompat.get( + notEq(intColumn("missing_column"), 0)), + STORE, paths, TOTAL_ROW_COUNT), + TOTAL_ROW_COUNT); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + gtEq(intColumn("column1"), 7), + lt(intColumn("column1"), 11)), + eq(binaryColumn("missing_column"), null))), + STORE, paths, TOTAL_ROW_COUNT), + 7, 8, 9, 10, 11, 12, 13); + + // Missing column filter is always false + assertRows(calculateRowRanges(FilterCompat.get( + or( + and( + gtEq(intColumn("column1"), 7), + lt(intColumn("column1"), 11)), + notEq(binaryColumn("missing_column"), null))), + STORE, paths, TOTAL_ROW_COUNT), + 7, 8, 9, 10, 11, 12, 13); + assertRows(calculateRowRanges(FilterCompat.get( + gt(intColumn("missing_column"), 0)), + STORE, paths, TOTAL_ROW_COUNT)); + } + + @Test + public void testFilteringWithMissingOffsetIndex() + { + Set paths = paths("column1", "column2", "column3", "column4", "column_wo_oi"); + + assertAllRows(calculateRowRanges(FilterCompat.get( + and( + and( + gtEq(intColumn("column1"), 7), + lt(intColumn("column1"), 11)), + and( + gt(binaryColumn("column2"), fromString("Romeo")), + ltEq(binaryColumn("column_wo_oi"), fromString("Tango"))))), + STORE, paths, TOTAL_ROW_COUNT), + TOTAL_ROW_COUNT); + } + + @Test + public void testFilteringWithAllNullPages() + { + Set paths = paths("column1", "column5"); + + assertAllRows(calculateRowRanges(FilterCompat.get( + notEq(longColumn("column5"), 1234567L)), + STORE, 
paths, TOTAL_ROW_COUNT), + TOTAL_ROW_COUNT); + assertAllRows(calculateRowRanges(FilterCompat.get( + or(gtEq(intColumn("column1"), 10), + notEq(longColumn("column5"), 1234567L))), + STORE, paths, TOTAL_ROW_COUNT), + TOTAL_ROW_COUNT); + assertRows(calculateRowRanges(FilterCompat.get( + eq(longColumn("column5"), 1234567L)), + STORE, paths, TOTAL_ROW_COUNT)); + assertRows(calculateRowRanges(FilterCompat.get( + and(lt(intColumn("column1"), 20), + gtEq(longColumn("column5"), 1234567L))), + STORE, paths, TOTAL_ROW_COUNT)); + } +}