From 3fb25a38ee50f6a4adddff041781d426936e6f3d Mon Sep 17 00:00:00 2001
From: Shardul Mahadik
Date: Fri, 20 Nov 2020 17:27:40 -0800
Subject: [PATCH] Hive Metadata Scan: Return empty statistics

---
 .../apache/iceberg/spark/source/Reader.java   | 21 +++++++++++++++++--
 .../iceberg/spark/source/SparkBatchScan.java  | 21 +++++++++++++++++--
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java
index b2787e518b..6d5b0ab8d4 100644
--- a/spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java
+++ b/spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java
@@ -24,6 +24,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.OptionalLong;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -280,8 +281,12 @@ public void pruneColumns(StructType newRequestedSchema) {
 
   @Override
   public Statistics estimateStatistics() {
-    if (!(table instanceof LegacyHiveTable) &&
-        (filterExpressions == null || filterExpressions == Expressions.alwaysTrue())) {
+    if (table instanceof LegacyHiveTable) {
+      // We currently don't have reliable stats for Hive tables
+      return EMPTY_STATS;
+    }
+
+    if (filterExpressions == null || filterExpressions == Expressions.alwaysTrue()) {
       long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(),
           SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
       return new Stats(SparkSchemaUtil.estimateSize(lazyType(), totalRecords), totalRecords);
@@ -300,6 +305,18 @@ public Statistics estimateStatistics() {
     return new Stats(sizeInBytes, numRows);
   }
 
+  private static final Statistics EMPTY_STATS = new Statistics() {
+    @Override
+    public OptionalLong sizeInBytes() {
+      return OptionalLong.empty();
+    }
+
+    @Override
+    public OptionalLong numRows() {
+      return OptionalLong.empty();
+    }
+  };
+
   @Override
   public boolean enableBatchRead() {
     if (readUsingBatch == null) {
diff --git a/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java b/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java
index ff67c8c731..97c5fead2e 100644
--- a/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java
+++ b/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java
@@ -23,6 +23,7 @@
 import java.io.Serializable;
 import java.util.Collection;
 import java.util.List;
+import java.util.OptionalLong;
 import java.util.stream.Collectors;
 import org.apache.iceberg.CombinedScanTask;
 import org.apache.iceberg.FileFormat;
@@ -184,8 +185,12 @@ public PartitionReaderFactory createReaderFactory() {
 
   @Override
   public Statistics estimateStatistics() {
-    if (!(table instanceof LegacyHiveTable) &&
-        (filterExpressions == null || filterExpressions.isEmpty())) {
+    if (table instanceof LegacyHiveTable) {
+      // We currently don't have reliable stats for Hive tables
+      return EMPTY_STATS;
+    }
+
+    if (filterExpressions == null || filterExpressions.isEmpty()) {
       LOG.debug("using table metadata to estimate table statistics");
       long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(),
           SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
@@ -208,6 +213,18 @@ public Statistics estimateStatistics() {
     return new Stats(sizeInBytes, numRows);
   }
 
+  private static final Statistics EMPTY_STATS = new Statistics() {
+    @Override
+    public OptionalLong sizeInBytes() {
+      return OptionalLong.empty();
+    }
+
+    @Override
+    public OptionalLong numRows() {
+      return OptionalLong.empty();
+    }
+  };
+
   private List<CombinedScanTask> tasks() {
     if (tasks == null) {
       TableScan scan = table
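
A minimal standalone sketch of the EMPTY_STATS pattern introduced by this patch, assuming a stand-in Statistics interface in place of Spark's DSv2 org.apache.spark.sql.connector.read.Statistics; the class and method names below are illustrative only, not part of the change. The point of the design: returning OptionalLong.empty() tells the planner that no estimate exists for a Hive table, rather than reporting 0 and implying the table is empty.

import java.util.OptionalLong;

public class EmptyStatsSketch {

  // Stand-in for Spark's DSv2 Statistics interface, which exposes the same
  // two OptionalLong accessors the patch overrides.
  interface Statistics {
    OptionalLong sizeInBytes();
    OptionalLong numRows();
  }

  // Same pattern as the EMPTY_STATS constant added to Reader and
  // SparkBatchScan: absent values rather than zeros.
  static final Statistics EMPTY_STATS = new Statistics() {
    @Override
    public OptionalLong sizeInBytes() {
      return OptionalLong.empty();
    }

    @Override
    public OptionalLong numRows() {
      return OptionalLong.empty();
    }
  };

  public static void main(String[] args) {
    // Both estimates come back absent, so a planner consuming them can fall
    // back to its own defaults instead of treating the table as zero-sized.
    System.out.println(EMPTY_STATS.sizeInBytes().isPresent()); // false
    System.out.println(EMPTY_STATS.numRows().isPresent());     // false
  }
}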