From 3fb25a38ee50f6a4adddff041781d426936e6f3d Mon Sep 17 00:00:00 2001
From: Shardul Mahadik
Date: Fri, 20 Nov 2020 17:27:40 -0800
Subject: [PATCH] Hive Metadata Scan: Return empty statistics

---
 .../apache/iceberg/spark/source/Reader.java   | 21 +++++++++++++++++--
 .../iceberg/spark/source/SparkBatchScan.java  | 21 +++++++++++++++++--
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java
index b2787e518b..6d5b0ab8d4 100644
--- a/spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java
+++ b/spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java
@@ -24,6 +24,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.OptionalLong;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -280,8 +281,12 @@ public void pruneColumns(StructType newRequestedSchema) {
 
   @Override
   public Statistics estimateStatistics() {
-    if (!(table instanceof LegacyHiveTable) &&
-        (filterExpressions == null || filterExpressions == Expressions.alwaysTrue())) {
+    if (table instanceof LegacyHiveTable) {
+      // We currently don't have reliable stats for Hive tables
+      return EMPTY_STATS;
+    }
+
+    if (filterExpressions == null || filterExpressions == Expressions.alwaysTrue()) {
       long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(),
           SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
       return new Stats(SparkSchemaUtil.estimateSize(lazyType(), totalRecords), totalRecords);
@@ -300,6 +305,18 @@ public Statistics estimateStatistics() {
     return new Stats(sizeInBytes, numRows);
   }
 
+  private static final Statistics EMPTY_STATS = new Statistics() {
+    @Override
+    public OptionalLong sizeInBytes() {
+      return OptionalLong.empty();
+    }
+
+    @Override
+    public OptionalLong numRows() {
+      return OptionalLong.empty();
+    }
+  };
+
   @Override
   public boolean enableBatchRead() {
     if (readUsingBatch == null) {
diff --git a/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java b/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java
index ff67c8c731..97c5fead2e 100644
--- a/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java
+++ b/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java
@@ -23,6 +23,7 @@
 import java.io.Serializable;
 import java.util.Collection;
 import java.util.List;
+import java.util.OptionalLong;
 import java.util.stream.Collectors;
 import org.apache.iceberg.CombinedScanTask;
 import org.apache.iceberg.FileFormat;
@@ -184,8 +185,12 @@ public PartitionReaderFactory createReaderFactory() {
 
   @Override
   public Statistics estimateStatistics() {
-    if (!(table instanceof LegacyHiveTable) &&
-        (filterExpressions == null || filterExpressions.isEmpty())) {
+    if (table instanceof LegacyHiveTable) {
+      // We currently don't have reliable stats for Hive tables
+      return EMPTY_STATS;
+    }
+
+    if (filterExpressions == null || filterExpressions.isEmpty()) {
       LOG.debug("using table metadata to estimate table statistics");
       long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(),
           SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
@@ -208,6 +213,18 @@ public Statistics estimateStatistics() {
     return new Stats(sizeInBytes, numRows);
   }
 
+  private static final Statistics EMPTY_STATS = new Statistics() {
+    @Override
+    public OptionalLong sizeInBytes() {
+      return OptionalLong.empty();
+    }
+
+    @Override
+    public OptionalLong numRows() {
+      return OptionalLong.empty();
+    }
+  };
+
   private List<CombinedScanTask> tasks() {
     if (tasks == null) {
       TableScan scan = table
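
A minimal standalone sketch of the EMPTY_STATS pattern introduced by this patch, assuming a stand-in Statistics interface in place of Spark's DSv2 org.apache.spark.sql.connector.read.Statistics; the class and method names below are illustrative only, not part of the change. The point of the design: returning OptionalLong.empty() tells the planner that no estimate exists for a Hive table, rather than reporting 0 and implying the table is empty.

import java.util.OptionalLong;

public class EmptyStatsSketch {

  // Stand-in for Spark's DSv2 Statistics interface, which exposes the same
  // two OptionalLong accessors the patch overrides.
  interface Statistics {
    OptionalLong sizeInBytes();
    OptionalLong numRows();
  }

  // Same pattern as the EMPTY_STATS constant added to Reader and
  // SparkBatchScan: absent values rather than zeros.
  static final Statistics EMPTY_STATS = new Statistics() {
    @Override
    public OptionalLong sizeInBytes() {
      return OptionalLong.empty();
    }

    @Override
    public OptionalLong numRows() {
      return OptionalLong.empty();
    }
  };

  public static void main(String[] args) {
    // Both estimates come back absent, so a planner consuming them can fall
    // back to its own defaults instead of treating the table as zero-sized.
    System.out.println(EMPTY_STATS.sizeInBytes().isPresent()); // false
    System.out.println(EMPTY_STATS.numRows().isPresent());     // false
  }
}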