diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java index 33eceb5168b5b..84722ffc0cbee 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java @@ -95,6 +95,14 @@ static SortedRangeSet of(Range first, Range... rest) return copyOf(first.getType(), rangeList); } + static SortedRangeSet of(List rangeList) + { + if (rangeList.isEmpty()) { + throw new IllegalArgumentException("cannot use empty rangeList"); + } + return copyOf(rangeList.get(0).getType(), rangeList); + } + /** * Provided Ranges are unioned together to form the SortedRangeSet */ diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/ValueSet.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/ValueSet.java index 527d0da8fac1e..fff1bcbc3a548 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/ValueSet.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/ValueSet.java @@ -19,6 +19,7 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo; import java.util.Collection; +import java.util.List; import static java.util.stream.Collectors.toList; @@ -83,6 +84,11 @@ static ValueSet ofRanges(Range first, Range... rest) return SortedRangeSet.of(first, rest); } + static ValueSet ofRanges(List ranges) + { + return SortedRangeSet.of(ranges); + } + static ValueSet copyOfRanges(Type type, Collection ranges) { return SortedRangeSet.copyOf(type, ranges); diff --git a/presto-delta/src/main/java/com/facebook/presto/delta/DeltaPageSourceProvider.java b/presto-delta/src/main/java/com/facebook/presto/delta/DeltaPageSourceProvider.java index 7fb29e2773b75..91b90f508b402 100644 --- a/presto-delta/src/main/java/com/facebook/presto/delta/DeltaPageSourceProvider.java +++ b/presto-delta/src/main/java/com/facebook/presto/delta/DeltaPageSourceProvider.java @@ -79,7 +79,6 @@ import static com.facebook.presto.delta.DeltaErrorCode.DELTA_MISSING_DATA; import static com.facebook.presto.delta.DeltaErrorCode.DELTA_PARQUET_SCHEMA_MISMATCH; import static com.facebook.presto.delta.DeltaSessionProperties.getParquetMaxReadBlockSize; -import static com.facebook.presto.delta.DeltaSessionProperties.isFailOnCorruptedParquetStatistics; import static com.facebook.presto.delta.DeltaSessionProperties.isParquetBatchReaderVerificationEnabled; import static com.facebook.presto.delta.DeltaSessionProperties.isParquetBatchReadsEnabled; import static com.facebook.presto.delta.DeltaTypeUtils.convertPartitionValue; @@ -160,7 +159,6 @@ public ConnectorPageSource createPageSource( deltaSplit.getFileSize(), regularColumnHandles, deltaTableHandle.toSchemaTableName(), - isFailOnCorruptedParquetStatistics(session), getParquetMaxReadBlockSize(session), isParquetBatchReadsEnabled(session), isParquetBatchReaderVerificationEnabled(session), @@ -204,7 +202,6 @@ private static ConnectorPageSource createParquetPageSource( long fileSize, List columns, SchemaTableName tableName, - boolean failOnCorruptedParquetStatistics, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, @@ -247,7 +244,7 @@ private static ConnectorPageSource createParquetPageSource( final ParquetDataSource finalDataSource = dataSource; ImmutableList.Builder blocks = ImmutableList.builder(); for (BlockMetaData block : footerBlocks.build()) { - if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, failOnCorruptedParquetStatistics)) { + if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain)) { blocks.add(block); } } diff --git a/presto-delta/src/main/java/com/facebook/presto/delta/DeltaSessionProperties.java b/presto-delta/src/main/java/com/facebook/presto/delta/DeltaSessionProperties.java index 98e18c3359407..af8abbd8f96ca 100644 --- a/presto-delta/src/main/java/com/facebook/presto/delta/DeltaSessionProperties.java +++ b/presto-delta/src/main/java/com/facebook/presto/delta/DeltaSessionProperties.java @@ -30,7 +30,6 @@ public final class DeltaSessionProperties { private static final String CACHE_ENABLED = "cache_enabled"; - private static final String PARQUET_FAIL_WITH_CORRUPTED_STATISTICS = "parquet_fail_with_corrupted_statistics"; private static final String PARQUET_MAX_READ_BLOCK_SIZE = "parquet_max_read_block_size"; private static final String PARQUET_BATCH_READ_OPTIMIZATION_ENABLED = "parquet_batch_read_optimization_enabled"; private static final String PARQUET_BATCH_READER_VERIFICATION_ENABLED = "parquet_batch_reader_verification_enabled"; @@ -50,11 +49,6 @@ public DeltaSessionProperties( "Enable cache for Delta tables", cacheConfig.isCachingEnabled(), false), - booleanProperty( - PARQUET_FAIL_WITH_CORRUPTED_STATISTICS, - "Parquet: Fail when scanning Parquet files with corrupted statistics", - hiveClientConfig.isFailOnCorruptedParquetStatistics(), - false), dataSizeSessionProperty( PARQUET_MAX_READ_BLOCK_SIZE, "Parquet: Maximum size of a block to read", @@ -87,11 +81,6 @@ public static boolean isCacheEnabled(ConnectorSession session) return session.getProperty(CACHE_ENABLED, Boolean.class); } - public static boolean isFailOnCorruptedParquetStatistics(ConnectorSession session) - { - return session.getProperty(PARQUET_FAIL_WITH_CORRUPTED_STATISTICS, Boolean.class); - } - public static DataSize getParquetMaxReadBlockSize(ConnectorSession session) { return session.getProperty(PARQUET_MAX_READ_BLOCK_SIZE, DataSize.class); diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java index d889cd16ac0b6..0c992c65e6261 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java @@ -49,6 +49,7 @@ "hive.max-global-split-iterator-threads", "hive.max-sort-files-per-bucket", "hive.bucket-writing", + "hive.parquet.fail-on-corrupted-statistics", "hive.optimized-reader.enabled"}) public class HiveClientConfig { @@ -101,7 +102,6 @@ public class HiveClientConfig private DataSize textMaxLineLength = new DataSize(100, MEGABYTE); private boolean useParquetColumnNames; - private boolean failOnCorruptedParquetStatistics = true; private DataSize parquetMaxReadBlockSize = new DataSize(16, MEGABYTE); private boolean assumeCanonicalPartitionKeys; @@ -935,19 +935,6 @@ public HiveClientConfig setUseParquetColumnNames(boolean useParquetColumnNames) return this; } - public boolean isFailOnCorruptedParquetStatistics() - { - return failOnCorruptedParquetStatistics; - } - - @Config("hive.parquet.fail-on-corrupted-statistics") - @ConfigDescription("Fail when scanning Parquet files with corrupted statistics") - public HiveClientConfig setFailOnCorruptedParquetStatistics(boolean failOnCorruptedParquetStatistics) - { - this.failOnCorruptedParquetStatistics = failOnCorruptedParquetStatistics; - return this; - } - @NotNull public DataSize getParquetMaxReadBlockSize() { diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java index a6cf022d4a117..b78708f748bfe 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveSessionProperties.java @@ -76,7 +76,6 @@ public final class HiveSessionProperties public static final String RESPECT_TABLE_FORMAT = "respect_table_format"; private static final String CREATE_EMPTY_BUCKET_FILES = "create_empty_bucket_files"; private static final String PARQUET_USE_COLUMN_NAME = "parquet_use_column_names"; - private static final String PARQUET_FAIL_WITH_CORRUPTED_STATISTICS = "parquet_fail_with_corrupted_statistics"; private static final String PARQUET_MAX_READ_BLOCK_SIZE = "parquet_max_read_block_size"; private static final String PARQUET_WRITER_BLOCK_SIZE = "parquet_writer_block_size"; private static final String PARQUET_WRITER_PAGE_SIZE = "parquet_writer_page_size"; @@ -327,11 +326,6 @@ public HiveSessionProperties(HiveClientConfig hiveClientConfig, OrcFileWriterCon "Experimental: Parquet: Access Parquet columns using names from the file", hiveClientConfig.isUseParquetColumnNames(), false), - booleanProperty( - PARQUET_FAIL_WITH_CORRUPTED_STATISTICS, - "Parquet: Fail when scanning Parquet files with corrupted statistics", - hiveClientConfig.isFailOnCorruptedParquetStatistics(), - false), dataSizeSessionProperty( PARQUET_MAX_READ_BLOCK_SIZE, "Parquet: Maximum size of a block to read", @@ -822,11 +816,6 @@ public static boolean isUseParquetColumnNames(ConnectorSession session) return session.getProperty(PARQUET_USE_COLUMN_NAME, Boolean.class); } - public static boolean isFailOnCorruptedParquetStatistics(ConnectorSession session) - { - return session.getProperty(PARQUET_FAIL_WITH_CORRUPTED_STATISTICS, Boolean.class); - } - public static DataSize getParquetMaxReadBlockSize(ConnectorSession session) { return session.getProperty(PARQUET_MAX_READ_BLOCK_SIZE, DataSize.class); diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetPageSourceFactory.java b/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetPageSourceFactory.java index 48f99f9095d0d..fb1b4495f5379 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetPageSourceFactory.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetPageSourceFactory.java @@ -95,7 +95,6 @@ import static com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA; import static com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH; import static com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize; -import static com.facebook.presto.hive.HiveSessionProperties.isFailOnCorruptedParquetStatistics; import static com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled; import static com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled; import static com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames; @@ -179,7 +178,6 @@ public Optional createPageSource( columns, tableName, isUseParquetColumnNames(session), - isFailOnCorruptedParquetStatistics(session), getParquetMaxReadBlockSize(session), isParquetBatchReadsEnabled(session), isParquetBatchReaderVerificationEnabled(session), @@ -202,7 +200,6 @@ public static ConnectorPageSource createParquetPageSource( List columns, SchemaTableName tableName, boolean useParquetColumnNames, - boolean failOnCorruptedParquetStatistics, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, @@ -252,7 +249,7 @@ public static ConnectorPageSource createParquetPageSource( final ParquetDataSource finalDataSource = dataSource; ImmutableList.Builder blocks = ImmutableList.builder(); for (BlockMetaData block : footerBlocks.build()) { - if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, failOnCorruptedParquetStatistics)) { + if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain)) { blocks.add(block); hiveFileContext.incrementCounter("parquet.blocksRead", 1); hiveFileContext.incrementCounter("parquet.rowsRead", block.getRowCount()); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java index 3f9f2f447004b..e3dec3b5fa76b 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java @@ -85,7 +85,6 @@ public void testDefaults() .setWriteValidationThreads(16) .setTextMaxLineLength(new DataSize(100, Unit.MEGABYTE)) .setUseParquetColumnNames(false) - .setFailOnCorruptedParquetStatistics(true) .setParquetMaxReadBlockSize(new DataSize(16, Unit.MEGABYTE)) .setUseOrcColumnNames(false) .setAssumeCanonicalPartitionKeys(false) @@ -207,7 +206,6 @@ public void testExplicitPropertyMappings() .put("hive.assume-canonical-partition-keys", "true") .put("hive.text.max-line-length", "13MB") .put("hive.parquet.use-column-names", "true") - .put("hive.parquet.fail-on-corrupted-statistics", "false") .put("hive.parquet.max-read-block-size", "66kB") .put("hive.orc.use-column-names", "true") .put("hive.orc.bloom-filters.enabled", "true") @@ -325,7 +323,6 @@ public void testExplicitPropertyMappings() .setS3FileSystemType(S3FileSystemType.EMRFS) .setTextMaxLineLength(new DataSize(13, Unit.MEGABYTE)) .setUseParquetColumnNames(true) - .setFailOnCorruptedParquetStatistics(false) .setParquetMaxReadBlockSize(new DataSize(66, Unit.KILOBYTE)) .setUseOrcColumnNames(true) .setAssumeCanonicalPartitionKeys(true) diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergPageSourceProvider.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergPageSourceProvider.java index 84cd7a79de230..d9bdeac92b6ce 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergPageSourceProvider.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergPageSourceProvider.java @@ -96,7 +96,6 @@ import static com.facebook.presto.hive.CacheQuota.NO_CACHE_CONSTRAINTS; import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR; import static com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize; -import static com.facebook.presto.hive.HiveSessionProperties.isFailOnCorruptedParquetStatistics; import static com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled; import static com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled; import static com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames; @@ -230,7 +229,6 @@ private ConnectorPageSource createDataPageSource( tableName, dataColumns, isUseParquetColumnNames(session), - isFailOnCorruptedParquetStatistics(session), getParquetMaxReadBlockSize(session), isParquetBatchReadsEnabled(session), isParquetBatchReaderVerificationEnabled(session), @@ -290,7 +288,6 @@ private static ConnectorPageSource createParquetPageSource( SchemaTableName tableName, List regularColumns, boolean useParquetColumnNames, - boolean failOnCorruptedParquetStatistics, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, @@ -338,7 +335,7 @@ private static ConnectorPageSource createParquetPageSource( for (BlockMetaData block : parquetMetadata.getBlocks()) { long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); if ((firstDataPage >= start) && (firstDataPage < (start + length)) && - predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, failOnCorruptedParquetStatistics)) { + predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain)) { blocks.add(block); } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java index fb25da38f483f..bc2ce234c0d2f 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java @@ -45,7 +45,6 @@ public final class IcebergSessionProperties { private static final String COMPRESSION_CODEC = "compression_codec"; - private static final String PARQUET_FAIL_WITH_CORRUPTED_STATISTICS = "parquet_fail_with_corrupted_statistics"; private static final String PARQUET_MAX_READ_BLOCK_SIZE = "parquet_max_read_block_size"; private static final String PARQUET_WRITER_BLOCK_SIZE = "parquet_writer_block_size"; private static final String PARQUET_WRITER_PAGE_SIZE = "parquet_writer_page_size"; @@ -92,11 +91,6 @@ public IcebergSessionProperties( false, value -> HiveCompressionCodec.valueOf(((String) value).toUpperCase()), HiveCompressionCodec::name), - booleanProperty( - PARQUET_FAIL_WITH_CORRUPTED_STATISTICS, - "Parquet: Fail when scanning Parquet files with corrupted statistics", - hiveClientConfig.isFailOnCorruptedParquetStatistics(), - false), booleanProperty( PARQUET_USE_COLUMN_NAMES, "Experimental: Parquet: Access Parquet columns using names from the file", @@ -259,11 +253,6 @@ public static HiveCompressionCodec getCompressionCodec(ConnectorSession session) return session.getProperty(COMPRESSION_CODEC, HiveCompressionCodec.class); } - public static boolean isFailOnCorruptedParquetStatistics(ConnectorSession session) - { - return session.getProperty(PARQUET_FAIL_WITH_CORRUPTED_STATISTICS, Boolean.class); - } - public static DataSize getParquetMaxReadBlockSize(ConnectorSession session) { return session.getProperty(PARQUET_MAX_READ_BLOCK_SIZE, DataSize.class); diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetDoubleStatistics.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetDoubleStatistics.java deleted file mode 100644 index 1879663736f2a..0000000000000 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetDoubleStatistics.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.facebook.presto.parquet.predicate; - -public class ParquetDoubleStatistics - implements ParquetRangeStatistics -{ - private final Double minimum; - private final Double maximum; - - public ParquetDoubleStatistics(Double minimum, Double maximum) - { - this.minimum = minimum; - this.maximum = maximum; - } - - @Override - public Double getMin() - { - return minimum; - } - - @Override - public Double getMax() - { - return maximum; - } -} diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetIntegerStatistics.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetIntegerStatistics.java deleted file mode 100644 index 46f0abde71bd2..0000000000000 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetIntegerStatistics.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.facebook.presto.parquet.predicate; - -public class ParquetIntegerStatistics - implements ParquetRangeStatistics -{ - private final Long minimum; - private final Long maximum; - - public ParquetIntegerStatistics(Long minimum, Long maximum) - { - this.minimum = minimum; - this.maximum = maximum; - } - - @Override - public Long getMin() - { - return minimum; - } - - @Override - public Long getMax() - { - return maximum; - } -} diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetRangeStatistics.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetRangeStatistics.java deleted file mode 100644 index 43f54e55b53f7..0000000000000 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetRangeStatistics.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.facebook.presto.parquet.predicate; - -public interface ParquetRangeStatistics -{ - T getMin(); - - T getMax(); -} diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetStringStatistics.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetStringStatistics.java deleted file mode 100644 index 89fc21d8e25db..0000000000000 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/ParquetStringStatistics.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.facebook.presto.parquet.predicate; - -import io.airlift.slice.Slice; - -public class ParquetStringStatistics - implements ParquetRangeStatistics -{ - private final Slice minimum; - private final Slice maximum; - - public ParquetStringStatistics(Slice minimum, Slice maximum) - { - this.minimum = minimum; - this.maximum = maximum; - } - - @Override - public Slice getMin() - { - return minimum; - } - - @Override - public Slice getMax() - { - return maximum; - } -} diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/Predicate.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/Predicate.java index 1295e163e9c37..0cf048381a71d 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/Predicate.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/Predicate.java @@ -25,7 +25,7 @@ public interface Predicate Predicate TRUE = new Predicate() { @Override - public boolean matches(long numberOfRows, Map> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics) + public boolean matches(long numberOfRows, Map> statistics, ParquetDataSourceId id) throws ParquetCorruptionException { return true; @@ -45,9 +45,8 @@ public boolean matches(DictionaryDescriptor dictionary) * Statistics to determine if a column is only null * @param statistics column statistics * @param id Parquet file name - * @param failOnCorruptedParquetStatistics whether to fail query when scanning a Parquet file with corrupted statistics */ - boolean matches(long numberOfRows, Map> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics) + boolean matches(long numberOfRows, Map> statistics, ParquetDataSourceId id) throws ParquetCorruptionException; /** diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/PredicateUtils.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/PredicateUtils.java index 6ef5205e6d429..364ab2ff9c164 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/PredicateUtils.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/PredicateUtils.java @@ -65,10 +65,8 @@ private PredicateUtils() { } - public static boolean isStatisticsOverflow(Type type, ParquetIntegerStatistics parquetIntegerStatistics) + public static boolean isStatisticsOverflow(Type type, long min, long max) { - long min = parquetIntegerStatistics.getMin(); - long max = parquetIntegerStatistics.getMax(); return (type.equals(TINYINT) && (min < Byte.MIN_VALUE || max > Byte.MAX_VALUE)) || (type.equals(SMALLINT) && (min < Short.MIN_VALUE || max > Short.MAX_VALUE)) || (type.equals(INTEGER) && (min < Integer.MIN_VALUE || max > Integer.MAX_VALUE)); @@ -86,11 +84,11 @@ public static Predicate buildPredicate(MessageType requestedSchema, TupleDomain< return new TupleDomainParquetPredicate(parquetTupleDomain, columnReferences.build()); } - public static boolean predicateMatches(Predicate parquetPredicate, BlockMetaData block, ParquetDataSource dataSource, Map, RichColumnDescriptor> descriptorsByPath, TupleDomain parquetTupleDomain, boolean failOnCorruptedParquetStatistics) + public static boolean predicateMatches(Predicate parquetPredicate, BlockMetaData block, ParquetDataSource dataSource, Map, RichColumnDescriptor> descriptorsByPath, TupleDomain parquetTupleDomain) throws ParquetCorruptionException { Map> columnStatistics = getStatistics(block, descriptorsByPath); - if (!parquetPredicate.matches(block.getRowCount(), columnStatistics, dataSource.getId(), failOnCorruptedParquetStatistics)) { + if (!parquetPredicate.matches(block.getRowCount(), columnStatistics, dataSource.getId())) { return false; } diff --git a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/TupleDomainParquetPredicate.java b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/TupleDomainParquetPredicate.java index 9bdf4bd122c27..531e5284e36bf 100644 --- a/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/TupleDomainParquetPredicate.java +++ b/presto-parquet/src/main/java/com/facebook/presto/parquet/predicate/TupleDomainParquetPredicate.java @@ -29,14 +29,9 @@ import io.airlift.slice.Slice; import io.airlift.slice.Slices; import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.statistics.BinaryStatistics; -import org.apache.parquet.column.statistics.BooleanStatistics; -import org.apache.parquet.column.statistics.DoubleStatistics; -import org.apache.parquet.column.statistics.FloatStatistics; -import org.apache.parquet.column.statistics.IntStatistics; -import org.apache.parquet.column.statistics.LongStatistics; import org.apache.parquet.column.statistics.Statistics; -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; import java.util.ArrayList; import java.util.List; @@ -54,6 +49,7 @@ import static com.facebook.presto.common.type.TinyintType.TINYINT; import static com.facebook.presto.common.type.Varchars.isVarcharType; import static com.facebook.presto.parquet.predicate.PredicateUtils.isStatisticsOverflow; +import static com.google.common.base.Preconditions.checkArgument; import static java.lang.Float.floatToRawIntBits; import static java.lang.String.format; import static java.util.Objects.requireNonNull; @@ -71,7 +67,7 @@ public TupleDomainParquetPredicate(TupleDomain effectivePredic } @Override - public boolean matches(long numberOfRows, Map> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics) + public boolean matches(long numberOfRows, Map> statistics, ParquetDataSourceId id) throws ParquetCorruptionException { if (numberOfRows == 0) { @@ -96,7 +92,12 @@ public boolean matches(long numberOfRows, Map> s // no stats for column } else { - Domain domain = getDomain(effectivePredicateDomain.getType(), numberOfRows, columnStatistics, id, column.toString(), failOnCorruptedParquetStatistics); + Domain domain = getDomain( + column, + effectivePredicateDomain.getType(), + numberOfRows, + columnStatistics, + id); if (effectivePredicateDomain.intersect(domain).isNone()) { return false; } @@ -127,7 +128,12 @@ private static boolean effectivePredicateMatches(Domain effectivePredicateDomain } @VisibleForTesting - public static Domain getDomain(Type type, long rowCount, Statistics statistics, ParquetDataSourceId id, String column, boolean failOnCorruptedParquetStatistics) + public static Domain getDomain( + ColumnDescriptor column, + Type type, + long rowCount, + Statistics statistics, + ParquetDataSourceId id) throws ParquetCorruptionException { if (statistics == null || statistics.isEmpty()) { @@ -144,11 +150,36 @@ public static Domain getDomain(Type type, long rowCount, Statistics statistic return Domain.create(ValueSet.all(type), hasNullValue); } - if (type.equals(BOOLEAN) && statistics instanceof BooleanStatistics) { - BooleanStatistics booleanStatistics = (BooleanStatistics) statistics; + try { + return getDomain( + column, + type, + ImmutableList.of(statistics.genericGetMin()), + ImmutableList.of(statistics.genericGetMax()), + hasNullValue); + } + catch (Exception exception) { + throw new ParquetCorruptionException(exception, format("Corrupted statistics for column \"%s\" in Parquet file \"%s\": [%s]", column.toString(), id, statistics)); + } + } + + /** + * Get a domain for the ranges defined by each pair of elements from {@code minimums} and {@code maximums}. + * Both arrays must have the same length. + */ + private static Domain getDomain( + ColumnDescriptor column, + Type type, + List minimums, + List maximums, + boolean hasNullValue) + { + checkArgument(minimums.size() == maximums.size(), "Expected minimums and maximums to have the same size"); - boolean hasTrueValues = !(booleanStatistics.getMax() == false && booleanStatistics.getMin() == false); - boolean hasFalseValues = !(booleanStatistics.getMax() == true && booleanStatistics.getMin() == true); + List ranges = new ArrayList<>(); + if (type.equals(BOOLEAN)) { + boolean hasTrueValues = minimums.stream().anyMatch(value -> (boolean) value) || maximums.stream().anyMatch(value -> (boolean) value); + boolean hasFalseValues = minimums.stream().anyMatch(value -> !(boolean) value) || maximums.stream().anyMatch(value -> !(boolean) value); if (hasTrueValues && hasFalseValues) { return Domain.all(type); } @@ -162,85 +193,70 @@ public static Domain getDomain(Type type, long rowCount, Statistics statistic throw new VerifyException("Impossible boolean statistics"); } - if ((type.equals(BIGINT) || type.equals(TINYINT) || type.equals(SMALLINT) || type.equals(INTEGER)) && (statistics instanceof LongStatistics || statistics instanceof IntStatistics)) { - ParquetIntegerStatistics parquetIntegerStatistics; - if (statistics instanceof LongStatistics) { - LongStatistics longStatistics = (LongStatistics) statistics; - if (longStatistics.genericGetMin() > longStatistics.genericGetMax()) { - failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, longStatistics); + if ((type.equals(BIGINT) || type.equals(TINYINT) || type.equals(SMALLINT) || type.equals(INTEGER))) { + for (int i = 0; i < minimums.size(); i++) { + long min = asLong(minimums.get(i)); + long max = asLong(maximums.get(i)); + if (isStatisticsOverflow(type, min, max)) { return Domain.create(ValueSet.all(type), hasNullValue); } - parquetIntegerStatistics = new ParquetIntegerStatistics(longStatistics.genericGetMin(), longStatistics.genericGetMax()); - } - else { - IntStatistics intStatistics = (IntStatistics) statistics; - if (intStatistics.genericGetMin() > intStatistics.genericGetMax()) { - failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, intStatistics); - return Domain.create(ValueSet.all(type), hasNullValue); - } - parquetIntegerStatistics = new ParquetIntegerStatistics((long) intStatistics.getMin(), (long) intStatistics.getMax()); - } - if (isStatisticsOverflow(type, parquetIntegerStatistics)) { - return Domain.create(ValueSet.all(type), hasNullValue); + + ranges.add(Range.range(type, min, true, max, true)); } - return createDomain(type, hasNullValue, parquetIntegerStatistics); + checkArgument(!ranges.isEmpty(), "cannot use empty ranges"); + return Domain.create(ValueSet.ofRanges(ranges), hasNullValue); } - if (type.equals(REAL) && statistics instanceof FloatStatistics) { - FloatStatistics floatStatistics = (FloatStatistics) statistics; - if (floatStatistics.genericGetMin() > floatStatistics.genericGetMax()) { - failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, floatStatistics); - return Domain.create(ValueSet.all(type), hasNullValue); - } + if (type.equals(REAL)) { + for (int i = 0; i < minimums.size(); i++) { + Float min = (Float) minimums.get(i); + Float max = (Float) maximums.get(i); - if (floatStatistics.genericGetMin().isNaN() || floatStatistics.genericGetMax().isNaN()) { - return Domain.create(ValueSet.all(type), hasNullValue); + if (min.isNaN() || max.isNaN()) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + ranges.add(Range.range(type, (long) floatToRawIntBits(min), true, (long) floatToRawIntBits(max), true)); } - - ParquetIntegerStatistics parquetStatistics = new ParquetIntegerStatistics( - (long) floatToRawIntBits(floatStatistics.getMin()), - (long) floatToRawIntBits(floatStatistics.getMax())); - - return createDomain(type, hasNullValue, parquetStatistics); + checkArgument(!ranges.isEmpty(), "cannot use empty ranges"); + return Domain.create(ValueSet.ofRanges(ranges), hasNullValue); } - if (type.equals(DOUBLE) && statistics instanceof DoubleStatistics) { - DoubleStatistics doubleStatistics = (DoubleStatistics) statistics; - if (doubleStatistics.genericGetMin() > doubleStatistics.genericGetMax()) { - failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, doubleStatistics); - return Domain.create(ValueSet.all(type), hasNullValue); - } + if (type.equals(DOUBLE)) { + for (int i = 0; i < minimums.size(); i++) { + Double min = (Double) minimums.get(i); + Double max = (Double) maximums.get(i); + if (min.isNaN() || max.isNaN()) { + return Domain.create(ValueSet.all(type), hasNullValue); + } - if (doubleStatistics.genericGetMin().isNaN() || doubleStatistics.genericGetMax().isNaN()) { - return Domain.create(ValueSet.all(type), hasNullValue); + ranges.add(Range.range(type, min, true, max, true)); } - - ParquetDoubleStatistics parquetDoubleStatistics = new ParquetDoubleStatistics(doubleStatistics.genericGetMin(), doubleStatistics.genericGetMax()); - return createDomain(type, hasNullValue, parquetDoubleStatistics); + checkArgument(!ranges.isEmpty(), "cannot use empty ranges"); + return Domain.create(ValueSet.ofRanges(ranges), hasNullValue); } - if (isVarcharType(type) && statistics instanceof BinaryStatistics) { - BinaryStatistics binaryStatistics = (BinaryStatistics) statistics; - Slice minSlice = Slices.wrappedBuffer(binaryStatistics.genericGetMin().getBytes()); - Slice maxSlice = Slices.wrappedBuffer(binaryStatistics.genericGetMax().getBytes()); - if (minSlice.compareTo(maxSlice) > 0) { - failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, binaryStatistics); - return Domain.create(ValueSet.all(type), hasNullValue); + if (isVarcharType(type)) { + for (int i = 0; i < minimums.size(); i++) { + Slice min = Slices.wrappedBuffer(((Binary) minimums.get(i)).toByteBuffer()); + Slice max = Slices.wrappedBuffer(((Binary) maximums.get(i)).toByteBuffer()); + ranges.add(Range.range(type, min, true, max, true)); } - ParquetStringStatistics parquetStringStatistics = new ParquetStringStatistics(minSlice, maxSlice); - return createDomain(type, hasNullValue, parquetStringStatistics); + checkArgument(!ranges.isEmpty(), "cannot use empty ranges"); + return Domain.create(ValueSet.ofRanges(ranges), hasNullValue); } - if (type.equals(DATE) && statistics instanceof IntStatistics) { - IntStatistics intStatistics = (IntStatistics) statistics; - if (intStatistics.genericGetMin() > intStatistics.genericGetMax()) { - failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, intStatistics); - return Domain.create(ValueSet.all(type), hasNullValue); + if (type.equals(DATE)) { + for (int i = 0; i < minimums.size(); i++) { + long min = asLong(minimums.get(i)); + long max = asLong(maximums.get(i)); + if (isStatisticsOverflow(type, min, max)) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + ranges.add(Range.range(type, min, true, max, true)); } - ParquetIntegerStatistics parquetIntegerStatistics = new ParquetIntegerStatistics((long) intStatistics.getMin(), (long) intStatistics.getMax()); - return createDomain(type, hasNullValue, parquetIntegerStatistics); + checkArgument(!ranges.isEmpty(), "cannot use empty ranges"); + return Domain.create(ValueSet.ofRanges(ranges), hasNullValue); } - return Domain.create(ValueSet.all(type), hasNullValue); } @@ -264,92 +280,57 @@ public static Domain getDomain(Type type, DictionaryDescriptor dictionaryDescrip catch (Exception e) { // In case of exception, just continue reading the data, not using dictionary page at all // OK to ignore exception when reading dictionaries - // TODO take failOnCorruptedParquetStatistics parameter and handle appropriately return Domain.all(type); } int dictionarySize = dictionaryPage.get().getDictionarySize(); - if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT64) { - List values = new ArrayList<>(dictionarySize); - for (int i = 0; i < dictionarySize; i++) { - values.add(dictionary.decodeToLong(i)); - } - return Domain.create(ValueSet.copyOf(type, values), true); - } - - if ((type.equals(BIGINT) || type.equals(DATE)) && columnDescriptor.getType() == PrimitiveTypeName.INT32) { - List values = new ArrayList<>(dictionarySize); - for (int i = 0; i < dictionarySize; i++) { - values.add((long) dictionary.decodeToInt(i)); - } - return Domain.create(ValueSet.copyOf(type, values), true); - } - - if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.DOUBLE) { - List values = new ArrayList<>(dictionarySize); - for (int i = 0; i < dictionarySize; i++) { - double value = dictionary.decodeToDouble(i); - if (Double.isNaN(value)) { - return Domain.all(type); - } - values.add(value); - } - return Domain.create(ValueSet.copyOf(type, values), true); + DictionaryValueConverter converter = new DictionaryValueConverter(dictionary); + Function convertFunction = converter.getConverter(columnDescriptor.getPrimitiveType()); + List values = new ArrayList<>(); + for (int i = 0; i < dictionarySize; i++) { + values.add(convertFunction.apply(i)); } - if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.FLOAT) { - List values = new ArrayList<>(dictionarySize); - for (int i = 0; i < dictionarySize; i++) { - float value = dictionary.decodeToFloat(i); - if (Float.isNaN(value)) { - return Domain.all(type); - } - values.add((double) value); - } - return Domain.create(ValueSet.copyOf(type, values), true); - } - - if (isVarcharType(type) && columnDescriptor.getType() == PrimitiveTypeName.BINARY) { - List values = new ArrayList<>(dictionarySize); - for (int i = 0; i < dictionarySize; i++) { - values.add(Slices.wrappedBuffer(dictionary.decodeToBinary(i).getBytes())); - } - return Domain.create(ValueSet.copyOf(type, values), true); - } - - return Domain.all(type); + // TODO: when min == max (i.e., singleton ranges, the construction of Domains can be done more efficiently + return getDomain(columnDescriptor, type, values, values, true); } - private static void failWithCorruptionException(boolean failOnCorruptedParquetStatistics, String column, ParquetDataSourceId id, Statistics statistics) - throws ParquetCorruptionException + public static long asLong(Object value) { - if (failOnCorruptedParquetStatistics) { - throw new ParquetCorruptionException(format("Corrupted statistics for column \"%s\" in Parquet file \"%s\": [%s]", column, id, statistics)); + if (value instanceof Byte || value instanceof Short || value instanceof Integer || value instanceof Long) { + return ((Number) value).longValue(); } - } - private static > Domain createDomain(Type type, boolean hasNullValue, ParquetRangeStatistics rangeStatistics) - { - return createDomain(type, hasNullValue, rangeStatistics, value -> value); + throw new IllegalArgumentException("Can't convert value to long: " + value.getClass().getName()); } - private static > Domain createDomain(Type type, - boolean hasNullValue, - ParquetRangeStatistics rangeStatistics, - Function function) + private static class DictionaryValueConverter { - F min = rangeStatistics.getMin(); - F max = rangeStatistics.getMax(); + private final Dictionary dictionary; - if (min != null && max != null) { - return Domain.create(ValueSet.ofRanges(Range.range(type, function.apply(min), true, function.apply(max), true)), hasNullValue); - } - if (max != null) { - return Domain.create(ValueSet.ofRanges(Range.lessThanOrEqual(type, function.apply(max))), hasNullValue); + private DictionaryValueConverter(Dictionary dictionary) + { + this.dictionary = dictionary; } - if (min != null) { - return Domain.create(ValueSet.ofRanges(Range.greaterThanOrEqual(type, function.apply(min))), hasNullValue); + + private Function getConverter(PrimitiveType primitiveType) + { + switch (primitiveType.getPrimitiveTypeName()) { + case INT32: + return (i) -> dictionary.decodeToInt(i); + case INT64: + return (i) -> dictionary.decodeToLong(i); + case FLOAT: + return (i) -> dictionary.decodeToFloat(i); + case DOUBLE: + return (i) -> dictionary.decodeToDouble(i); + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + case INT96: + return (i) -> dictionary.decodeToBinary(i); + default: + throw new IllegalArgumentException("Unsupported Parquet primitive type: " + primitiveType.getPrimitiveTypeName()); + } } - return Domain.create(ValueSet.all(type), hasNullValue); } } diff --git a/presto-parquet/src/test/java/com/facebook/presto/parquet/TestTupleDomainParquetPredicate.java b/presto-parquet/src/test/java/com/facebook/presto/parquet/TestTupleDomainParquetPredicate.java index 9ab5e7f6cb1f7..93c6d708e6d5a 100644 --- a/presto-parquet/src/test/java/com/facebook/presto/parquet/TestTupleDomainParquetPredicate.java +++ b/presto-parquet/src/test/java/com/facebook/presto/parquet/TestTupleDomainParquetPredicate.java @@ -64,10 +64,13 @@ import static java.util.Collections.singletonMap; import static org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType; import static org.apache.parquet.schema.OriginalType.UTF8; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; +import static org.apache.parquet.schema.Type.Repetition.REQUIRED; import static org.assertj.core.api.Assertions.assertThatExceptionOfType; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; @@ -81,13 +84,13 @@ public class TestTupleDomainParquetPredicate public void testBoolean() throws ParquetCorruptionException { - String column = "BooleanColumn"; - assertEquals(getDomain(BOOLEAN, 0, null, ID, column, true), Domain.all(BOOLEAN)); + ColumnDescriptor columnDescriptor = createColumnDescriptor(PrimitiveTypeName.BOOLEAN, "BooleanColumn"); + assertEquals(getDomain(columnDescriptor, BOOLEAN, 0, null, ID), Domain.all(BOOLEAN)); - assertEquals(getDomain(BOOLEAN, 10, booleanColumnStats(true, true), ID, column, true), singleValue(BOOLEAN, true)); - assertEquals(getDomain(BOOLEAN, 10, booleanColumnStats(false, false), ID, column, true), singleValue(BOOLEAN, false)); + assertEquals(getDomain(columnDescriptor, BOOLEAN, 10, booleanColumnStats(true, true), ID), singleValue(BOOLEAN, true)); + assertEquals(getDomain(columnDescriptor, BOOLEAN, 10, booleanColumnStats(false, false), ID), singleValue(BOOLEAN, false)); - assertEquals(getDomain(BOOLEAN, 20, booleanColumnStats(false, true), ID, column, true), Domain.all(BOOLEAN)); + assertEquals(getDomain(columnDescriptor, BOOLEAN, 20, booleanColumnStats(false, true), ID), Domain.all(BOOLEAN)); } private static BooleanStatistics booleanColumnStats(boolean minimum, boolean maximum) @@ -101,138 +104,126 @@ private static BooleanStatistics booleanColumnStats(boolean minimum, boolean max public void testBigint() throws ParquetCorruptionException { - String column = "BigintColumn"; - assertEquals(getDomain(BIGINT, 0, null, ID, column, true), Domain.all(BIGINT)); + ColumnDescriptor columnDescriptor = createColumnDescriptor(INT64, "BigintColumn"); + assertEquals(getDomain(columnDescriptor, BIGINT, 0, null, ID), Domain.all(BIGINT)); - assertEquals(getDomain(BIGINT, 10, longColumnStats(100L, 100L), ID, column, true), singleValue(BIGINT, 100L)); + assertEquals(getDomain(columnDescriptor, BIGINT, 10, longColumnStats(100L, 100L), ID), singleValue(BIGINT, 100L)); - assertEquals(getDomain(BIGINT, 10, longColumnStats(0L, 100L), ID, column, true), create(ValueSet.ofRanges(range(BIGINT, 0L, true, 100L, true)), false)); + assertEquals(getDomain(columnDescriptor, BIGINT, 10, longColumnStats(0L, 100L), ID), create(ValueSet.ofRanges(range(BIGINT, 0L, true, 100L, true)), false)); - assertEquals(getDomain(BIGINT, 20, longOnlyNullsStats(10), ID, column, true), create(ValueSet.all(BIGINT), true)); - // ignore corrupted statistics - assertEquals(getDomain(BIGINT, 10, longColumnStats(100L, 0L), ID, column, false), create(ValueSet.all(BIGINT), false)); + assertEquals(getDomain(columnDescriptor, BIGINT, 20, longOnlyNullsStats(10), ID), create(ValueSet.all(BIGINT), true)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) - .isThrownBy(() -> getDomain(BIGINT, 10, longColumnStats(100L, 10L), ID, column, true)) - .withMessage("Corrupted statistics for column \"BigintColumn\" in Parquet file \"testFile\": [min: 100, max: 10, num_nulls: 0]"); + .isThrownBy(() -> getDomain(columnDescriptor, BIGINT, 10, longColumnStats(100L, 10L), ID)) + .withMessage("Corrupted statistics for column \"[] required int64 BigintColumn\" in Parquet file \"testFile\": [min: 100, max: 10, num_nulls: 0]"); } @Test public void testInteger() throws ParquetCorruptionException { - String column = "IntegerColumn"; - assertEquals(getDomain(INTEGER, 0, null, ID, column, true), Domain.all(INTEGER)); + ColumnDescriptor columnDescriptor = createColumnDescriptor(INT32, "IntegerColumn"); + assertEquals(getDomain(columnDescriptor, INTEGER, 0, null, ID), Domain.all(INTEGER)); - assertEquals(getDomain(INTEGER, 10, longColumnStats(100, 100), ID, column, true), singleValue(INTEGER, 100L)); + assertEquals(getDomain(columnDescriptor, INTEGER, 10, longColumnStats(100, 100), ID), singleValue(INTEGER, 100L)); - assertEquals(getDomain(INTEGER, 10, longColumnStats(0, 100), ID, column, true), create(ValueSet.ofRanges(range(INTEGER, 0L, true, 100L, true)), false)); + assertEquals(getDomain(columnDescriptor, INTEGER, 10, longColumnStats(0, 100), ID), create(ValueSet.ofRanges(range(INTEGER, 0L, true, 100L, true)), false)); - assertEquals(getDomain(INTEGER, 20, longColumnStats(0, 2147483648L), ID, column, true), notNull(INTEGER)); + assertEquals(getDomain(columnDescriptor, INTEGER, 20, longColumnStats(0, 2147483648L), ID), notNull(INTEGER)); - assertEquals(getDomain(INTEGER, 20, longOnlyNullsStats(10), ID, column, true), create(ValueSet.all(INTEGER), true)); - // ignore corrupted statistics - assertEquals(getDomain(INTEGER, 10, longColumnStats(2147483648L, 0), ID, column, false), create(ValueSet.all(INTEGER), false)); + assertEquals(getDomain(columnDescriptor, INTEGER, 20, longOnlyNullsStats(10), ID), create(ValueSet.all(INTEGER), true)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) - .isThrownBy(() -> getDomain(INTEGER, 10, longColumnStats(2147483648L, 10), ID, column, true)) - .withMessage("Corrupted statistics for column \"IntegerColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); + .isThrownBy(() -> getDomain(columnDescriptor, INTEGER, 10, longColumnStats(2147483648L, 10), ID)) + .withMessage("Corrupted statistics for column \"[] required int32 IntegerColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); } @Test public void testSmallint() throws ParquetCorruptionException { - String column = "SmallintColumn"; - assertEquals(getDomain(SMALLINT, 0, null, ID, column, true), Domain.all(SMALLINT)); + ColumnDescriptor columnDescriptor = createColumnDescriptor(INT32, "SmallintColumn"); + assertEquals(getDomain(columnDescriptor, SMALLINT, 0, null, ID), Domain.all(SMALLINT)); - assertEquals(getDomain(SMALLINT, 10, longColumnStats(100, 100), ID, column, true), singleValue(SMALLINT, 100L)); + assertEquals(getDomain(columnDescriptor, SMALLINT, 10, longColumnStats(100, 100), ID), singleValue(SMALLINT, 100L)); - assertEquals(getDomain(SMALLINT, 10, longColumnStats(0, 100), ID, column, true), create(ValueSet.ofRanges(range(SMALLINT, 0L, true, 100L, true)), false)); + assertEquals(getDomain(columnDescriptor, SMALLINT, 10, longColumnStats(0, 100), ID), create(ValueSet.ofRanges(range(SMALLINT, 0L, true, 100L, true)), false)); - assertEquals(getDomain(SMALLINT, 20, longColumnStats(0, 2147483648L), ID, column, true), notNull(SMALLINT)); + assertEquals(getDomain(columnDescriptor, SMALLINT, 20, longColumnStats(0, 2147483648L), ID), notNull(SMALLINT)); - assertEquals(getDomain(SMALLINT, 20, longOnlyNullsStats(10), ID, column, true), create(ValueSet.all(SMALLINT), true)); - // ignore corrupted statistics - assertEquals(getDomain(SMALLINT, 10, longColumnStats(2147483648L, 0), ID, column, false), create(ValueSet.all(SMALLINT), false)); + assertEquals(getDomain(columnDescriptor, SMALLINT, 20, longOnlyNullsStats(10), ID), create(ValueSet.all(SMALLINT), true)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) - .isThrownBy(() -> getDomain(SMALLINT, 10, longColumnStats(2147483648L, 10), ID, column, true)) - .withMessage("Corrupted statistics for column \"SmallintColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); + .isThrownBy(() -> getDomain(columnDescriptor, SMALLINT, 10, longColumnStats(2147483648L, 10), ID)) + .withMessage("Corrupted statistics for column \"[] required int32 SmallintColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); } @Test public void testTinyint() throws ParquetCorruptionException { - String column = "TinyintColumn"; - assertEquals(getDomain(TINYINT, 0, null, ID, column, true), Domain.all(TINYINT)); + ColumnDescriptor columnDescriptor = createColumnDescriptor(INT32, "TinyintColumn"); + assertEquals(getDomain(columnDescriptor, TINYINT, 0, null, ID), Domain.all(TINYINT)); - assertEquals(getDomain(TINYINT, 10, longColumnStats(100, 100), ID, column, true), singleValue(TINYINT, 100L)); + assertEquals(getDomain(columnDescriptor, TINYINT, 10, longColumnStats(100, 100), ID), singleValue(TINYINT, 100L)); - assertEquals(getDomain(TINYINT, 10, longColumnStats(0, 100), ID, column, true), create(ValueSet.ofRanges(range(TINYINT, 0L, true, 100L, true)), false)); + assertEquals(getDomain(columnDescriptor, TINYINT, 10, longColumnStats(0, 100), ID), create(ValueSet.ofRanges(range(TINYINT, 0L, true, 100L, true)), false)); - assertEquals(getDomain(TINYINT, 20, longColumnStats(0, 2147483648L), ID, column, true), notNull(TINYINT)); + assertEquals(getDomain(columnDescriptor, TINYINT, 20, longColumnStats(0, 2147483648L), ID), notNull(TINYINT)); - assertEquals(getDomain(TINYINT, 20, longOnlyNullsStats(10), ID, column, true), create(ValueSet.all(TINYINT), true)); - // ignore corrupted statistics - assertEquals(getDomain(TINYINT, 10, longColumnStats(2147483648L, 0), ID, column, false), create(ValueSet.all(TINYINT), false)); + assertEquals(getDomain(columnDescriptor, TINYINT, 20, longOnlyNullsStats(10), ID), create(ValueSet.all(TINYINT), true)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) - .isThrownBy(() -> getDomain(TINYINT, 10, longColumnStats(2147483648L, 10), ID, column, true)) - .withMessage("Corrupted statistics for column \"TinyintColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); + .isThrownBy(() -> getDomain(columnDescriptor, TINYINT, 10, longColumnStats(2147483648L, 10), ID)) + .withMessage("Corrupted statistics for column \"[] required int32 TinyintColumn\" in Parquet file \"testFile\": [min: 2147483648, max: 10, num_nulls: 0]"); } @Test public void testDouble() throws Exception { - String column = "DoubleColumn"; - assertEquals(getDomain(DOUBLE, 0, null, ID, column, true), Domain.all(DOUBLE)); + ColumnDescriptor columnDescriptor = createColumnDescriptor(PrimitiveTypeName.DOUBLE, "DoubleColumn"); + assertEquals(getDomain(columnDescriptor, DOUBLE, 0, null, ID), Domain.all(DOUBLE)); - assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(42.24, 42.24), ID, column, true), singleValue(DOUBLE, 42.24)); + assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(42.24, 42.24), ID), singleValue(DOUBLE, 42.24)); - assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(3.3, 42.24), ID, column, true), create(ValueSet.ofRanges(range(DOUBLE, 3.3, true, 42.24, true)), false)); + assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(3.3, 42.24), ID), create(ValueSet.ofRanges(range(DOUBLE, 3.3, true, 42.24, true)), false)); - assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(NaN, NaN), ID, column, true), Domain.notNull(DOUBLE)); + assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(NaN, NaN), ID), Domain.notNull(DOUBLE)); - assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(NaN, NaN, true), ID, column, true), Domain.all(DOUBLE)); + assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(NaN, NaN, true), ID), Domain.all(DOUBLE)); - assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(3.3, NaN), ID, column, true), Domain.notNull(DOUBLE)); + assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(3.3, NaN), ID), Domain.notNull(DOUBLE)); - assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(3.3, NaN, true), ID, column, true), Domain.all(DOUBLE)); + assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(3.3, NaN, true), ID), Domain.all(DOUBLE)); assertEquals(getDomain(DOUBLE, doubleDictionaryDescriptor(NaN)), Domain.all(DOUBLE)); assertEquals(getDomain(DOUBLE, doubleDictionaryDescriptor(3.3, NaN)), Domain.all(DOUBLE)); - // ignore corrupted statistics - assertEquals(getDomain(DOUBLE, 10, doubleColumnStats(42.24, 3.3), ID, column, false), create(ValueSet.all(DOUBLE), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) - .isThrownBy(() -> getDomain(DOUBLE, 10, doubleColumnStats(42.24, 3.3), ID, column, true)) - .withMessage("Corrupted statistics for column \"DoubleColumn\" in Parquet file \"testFile\": [min: 42.24, max: 3.3, num_nulls: 0]"); + .isThrownBy(() -> getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(42.24, 3.3), ID)) + .withMessage("Corrupted statistics for column \"[] required double DoubleColumn\" in Parquet file \"testFile\": [min: 42.24, max: 3.3, num_nulls: 0]"); } @Test public void testString() throws ParquetCorruptionException { - String column = "StringColumn"; - assertEquals(getDomain(createUnboundedVarcharType(), 0, null, ID, column, true), Domain.all(createUnboundedVarcharType())); + ColumnDescriptor columnDescriptor = createColumnDescriptor(BINARY, "StringColumn"); + assertEquals(getDomain(columnDescriptor, createUnboundedVarcharType(), 0, null, ID), Domain.all(createUnboundedVarcharType())); - assertEquals(getDomain(createUnboundedVarcharType(), 10, stringColumnStats("taco", "taco"), ID, column, true), singleValue(createUnboundedVarcharType(), utf8Slice("taco"))); + assertEquals(getDomain(columnDescriptor, createUnboundedVarcharType(), 10, stringColumnStats("taco", "taco"), ID), singleValue(createUnboundedVarcharType(), utf8Slice("taco"))); - assertEquals(getDomain(createUnboundedVarcharType(), 10, stringColumnStats("apple", "taco"), ID, column, true), create(ValueSet.ofRanges(range(createUnboundedVarcharType(), utf8Slice("apple"), true, utf8Slice("taco"), true)), false)); + assertEquals(getDomain(columnDescriptor, createUnboundedVarcharType(), 10, stringColumnStats("apple", "taco"), ID), create(ValueSet.ofRanges(range(createUnboundedVarcharType(), utf8Slice("apple"), true, utf8Slice("taco"), true)), false)); - assertEquals(getDomain(createUnboundedVarcharType(), 10, stringColumnStats("中国", "美利坚"), ID, column, true), create(ValueSet.ofRanges(range(createUnboundedVarcharType(), utf8Slice("中国"), true, utf8Slice("美利坚"), true)), false)); + assertEquals(getDomain(columnDescriptor, createUnboundedVarcharType(), 10, stringColumnStats("中国", "美利坚"), ID), create(ValueSet.ofRanges(range(createUnboundedVarcharType(), utf8Slice("中国"), true, utf8Slice("美利坚"), true)), false)); - // ignore corrupted statistics - assertEquals(getDomain(createUnboundedVarcharType(), 10, stringColumnStats("taco", "apple"), ID, column, false), create(ValueSet.all(createUnboundedVarcharType()), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) - .isThrownBy(() -> getDomain(createUnboundedVarcharType(), 10, stringColumnStats("taco", "apple"), ID, column, true)) - .withMessage("Corrupted statistics for column \"StringColumn\" in Parquet file \"testFile\": [min: taco, max: apple, num_nulls: 0]"); + .isThrownBy(() -> getDomain(columnDescriptor, createUnboundedVarcharType(), 10, stringColumnStats("taco", "apple"), ID)) + .withMessage("Corrupted statistics for column \"[] required binary StringColumn\" in Parquet file \"testFile\": [min: taco, max: apple, num_nulls: 0]"); } private static Statistics stringColumnStats(String minimum, String maximum) @@ -248,53 +239,49 @@ private static Statistics stringColumnStats(String minimum, String maximum) public void testFloat() throws Exception { - String column = "FloatColumn"; - assertEquals(getDomain(REAL, 0, null, ID, column, true), Domain.all(REAL)); + ColumnDescriptor columnDescriptor = createColumnDescriptor(FLOAT, "FloatColumn"); + assertEquals(getDomain(columnDescriptor, REAL, 0, null, ID), Domain.all(REAL)); float minimum = 4.3f; float maximum = 40.3f; - assertEquals(getDomain(REAL, 10, floatColumnStats(minimum, minimum), ID, column, true), singleValue(REAL, (long) floatToRawIntBits(minimum))); + assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(minimum, minimum), ID), singleValue(REAL, (long) floatToRawIntBits(minimum))); assertEquals( - getDomain(REAL, 10, floatColumnStats(minimum, maximum), ID, column, true), + getDomain(columnDescriptor, REAL, 10, floatColumnStats(minimum, maximum), ID), create(ValueSet.ofRanges(range(REAL, (long) floatToRawIntBits(minimum), true, (long) floatToRawIntBits(maximum), true)), false)); - assertEquals(getDomain(REAL, 10, floatColumnStats(NaN, NaN), ID, column, true), Domain.notNull(REAL)); + assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(NaN, NaN), ID), Domain.notNull(REAL)); - assertEquals(getDomain(REAL, 10, floatColumnStats(NaN, NaN, true), ID, column, true), Domain.all(REAL)); + assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(NaN, NaN, true), ID), Domain.all(REAL)); - assertEquals(getDomain(REAL, 10, floatColumnStats(minimum, NaN), ID, column, true), Domain.notNull(REAL)); + assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(minimum, NaN), ID), Domain.notNull(REAL)); - assertEquals(getDomain(REAL, 10, floatColumnStats(minimum, NaN, true), ID, column, true), Domain.all(REAL)); + assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(minimum, NaN, true), ID), Domain.all(REAL)); assertEquals(getDomain(REAL, floatDictionaryDescriptor(NaN)), Domain.all(REAL)); assertEquals(getDomain(REAL, floatDictionaryDescriptor(minimum, NaN)), Domain.all(REAL)); - // ignore corrupted statistics - assertEquals(getDomain(REAL, 10, floatColumnStats(maximum, minimum), ID, column, false), create(ValueSet.all(REAL), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) - .isThrownBy(() -> getDomain(REAL, 10, floatColumnStats(maximum, minimum), ID, column, true)) - .withMessage("Corrupted statistics for column \"FloatColumn\" in Parquet file \"testFile\": [min: 40.3, max: 4.3, num_nulls: 0]"); + .isThrownBy(() -> getDomain(columnDescriptor, REAL, 10, floatColumnStats(maximum, minimum), ID)) + .withMessage("Corrupted statistics for column \"[] required float FloatColumn\" in Parquet file \"testFile\": [min: 40.3, max: 4.3, num_nulls: 0]"); } @Test public void testDate() throws ParquetCorruptionException { - String column = "DateColumn"; - assertEquals(getDomain(DATE, 0, null, ID, column, true), Domain.all(DATE)); - assertEquals(getDomain(DATE, 10, intColumnStats(100, 100), ID, column, true), singleValue(DATE, 100L)); - assertEquals(getDomain(DATE, 10, intColumnStats(0, 100), ID, column, true), create(ValueSet.ofRanges(range(DATE, 0L, true, 100L, true)), false)); + ColumnDescriptor columnDescriptor = createColumnDescriptor(INT32, "DateColumn"); + assertEquals(getDomain(columnDescriptor, DATE, 0, null, ID), Domain.all(DATE)); + assertEquals(getDomain(columnDescriptor, DATE, 10, intColumnStats(100, 100), ID), singleValue(DATE, 100L)); + assertEquals(getDomain(columnDescriptor, DATE, 10, intColumnStats(0, 100), ID), create(ValueSet.ofRanges(range(DATE, 0L, true, 100L, true)), false)); - // ignore corrupted statistics - assertEquals(getDomain(DATE, 10, intColumnStats(200, 100), ID, column, false), create(ValueSet.all(DATE), false)); // fail on corrupted statistics assertThatExceptionOfType(ParquetCorruptionException.class) - .isThrownBy(() -> getDomain(DATE, 10, intColumnStats(200, 100), ID, column, true)) - .withMessage("Corrupted statistics for column \"DateColumn\" in Parquet file \"testFile\": [min: 200, max: 100, num_nulls: 0]"); + .isThrownBy(() -> getDomain(columnDescriptor, DATE, 10, intColumnStats(200, 100), ID)) + .withMessage("Corrupted statistics for column \"[] required int32 DateColumn\" in Parquet file \"testFile\": [min: 200, max: 100, num_nulls: 0]"); } @Test @@ -309,7 +296,7 @@ public void testVarcharMatchesWithStatistics() Statistics stats = getStatsBasedOnType(column.getType()); stats.setNumNulls(1L); stats.setMinMaxFromBytes(value.getBytes(), value.getBytes()); - assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID, true)); + assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID)); } @Test(dataProvider = "typeForParquetInt32") @@ -324,9 +311,9 @@ public void testIntegerMatchesWithStatistics(Type typeForParquetInt32) Domain.create(ValueSet.of(typeForParquetInt32, 42L, 43L, 44L, 112L), false))); TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column)); - assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(32, 42)), ID, true)); - assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(30, 40)), ID, true)); - assertEquals(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(1024, 0x10000 + 42)), ID, true), (typeForParquetInt32 != INTEGER)); // stats invalid for smallint/tinyint + assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(32, 42)), ID)); + assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(30, 40)), ID)); + assertEquals(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(1024, 0x10000 + 42)), ID), (typeForParquetInt32 != INTEGER)); // stats invalid for smallint/tinyint } @DataProvider @@ -351,9 +338,9 @@ public void testBigintMatchesWithStatistics() Domain.create(ValueSet.of(BIGINT, 42L, 43L, 44L, 404L), false))); TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column)); - assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(32, 42)), ID, true)); - assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(30, 40)), ID, true)); - assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(1024, 0x10000 + 42)), ID, true)); + assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(32, 42)), ID)); + assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(30, 40)), ID)); + assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(1024, 0x10000 + 42)), ID)); } @Test @@ -453,4 +440,9 @@ private static LongStatistics longOnlyNullsStats(long numNulls) statistics.setNumNulls(numNulls); return statistics; } + + private ColumnDescriptor createColumnDescriptor(PrimitiveTypeName typeName, String columnName) + { + return new ColumnDescriptor(new String[]{}, new PrimitiveType(REQUIRED, typeName, columnName), 0, 0); + } }