From 4b588f409bb41432ecb2d45ce3c7156b84f7ede9 Mon Sep 17 00:00:00 2001
From: Edgar Rodriguez
Date: Fri, 7 Jun 2019 16:06:44 -0700
Subject: [PATCH 1/2] Add ORC support for listPartitions

---
 .../apache/iceberg/spark/SparkTableUtil.scala | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala b/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
index 86dfbe3772c8..08fec59f1366 100644
--- a/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
+++ b/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
@@ -73,6 +73,8 @@ object SparkTableUtil {
       listAvroPartition(partition, uri)
     } else if (format.contains("parquet")) {
       listParquetPartition(partition, uri)
+    } else if (format.contains("orc")) {
+      listOrcPartition(partition, uri)
     } else {
       throw new UnsupportedOperationException(s"Unknown partition format: $format")
     }
@@ -248,5 +250,28 @@ object SparkTableUtil {
         bytesMapToArray(metrics.lowerBounds),
         bytesMapToArray(metrics.upperBounds))
     }
   }
+
+  private def listOrcPartition(
+      partitionPath: Map[String, String],
+      partitionUri: String): Seq[SparkDataFile] = {
+    val conf = new Configuration()
+    val partition = new Path(partitionUri)
+    val fs = partition.getFileSystem(conf)
+
+    fs.listStatus(partition, HiddenPathFilter).filter(_.isFile).map { stat =>
+      // TODO: add ORC metrics
+      SparkDataFile(
+        stat.getPath.toString,
+        partitionPath, "orc", stat.getLen,
+        stat.getBlockSize,
+        null,
+        null,
+        null,
+        null,
+        null,
+        null
+      )
+    }
+  }
 }

From f651eaecceae40fa4ca85ebb6025f2de8113c49b Mon Sep 17 00:00:00 2001
From: Edgar Rodriguez
Date: Fri, 7 Jun 2019 16:27:03 -0700
Subject: [PATCH 2/2] Add OrcMetrics with row count only

---
 .../org/apache/iceberg/orc/OrcMetrics.java    | 60 +++++++++++++++++++
 .../apache/iceberg/spark/SparkTableUtil.scala | 18 +++---
 2 files changed, 71 insertions(+), 7 deletions(-)
 create mode 100644 orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java

diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
new file mode 100644
index 000000000000..2defc7d2fd74
--- /dev/null
+++ b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.orc;
+
+import java.io.IOException;
+import java.util.Collections;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.iceberg.Metrics;
+import org.apache.iceberg.exceptions.RuntimeIOException;
+import org.apache.iceberg.hadoop.HadoopInputFile;
+import org.apache.iceberg.io.InputFile;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+
+public class OrcMetrics {
+
+  private OrcMetrics() {}
+
+  public static Metrics fromInputFile(InputFile file) {
+    final Configuration config = (file instanceof HadoopInputFile)
+        ? ((HadoopInputFile) file).getConf()
+        : new Configuration();
+    return fromInputFile(file, config);
+  }
+
+  public static Metrics fromInputFile(InputFile file, Configuration config) {
+    try {
+      final Reader orcReader = OrcFile.createReader(new Path(file.location()),
+          OrcFile.readerOptions(config));
+
+      // TODO: implement the remaining ORC metrics (column sizes, value counts, bounds)
+      return new Metrics(orcReader.getNumberOfRows(),
+          null,
+          null,
+          Collections.emptyMap(),
+          null,
+          null);
+    } catch (IOException ioe) {
+      throw new RuntimeIOException(ioe, "Failed to read footer of file: %s", file);
+    }
+  }
+}
diff --git a/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala b/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
index 08fec59f1366..9c95916d3757 100644
--- a/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
+++ b/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
@@ -32,6 +32,9 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
 import scala.collection.JavaConverters._
 
+import org.apache.iceberg.hadoop.HadoopInputFile
+import org.apache.iceberg.orc.OrcMetrics
+
 object SparkTableUtil {
   /**
    * Returns a DataFrame with a row for each partition in the table.
@@ -259,17 +262,18 @@ object SparkTableUtil {
     val fs = partition.getFileSystem(conf)
 
     fs.listStatus(partition, HiddenPathFilter).filter(_.isFile).map { stat =>
-      // TODO: add ORC metrics
+      val metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath, conf))
+
       SparkDataFile(
         stat.getPath.toString,
         partitionPath, "orc", stat.getLen,
         stat.getBlockSize,
-        null,
-        null,
-        null,
-        null,
-        null,
-        null
+        metrics.recordCount,
+        mapToArray(metrics.columnSizes),
+        mapToArray(metrics.valueCounts),
+        mapToArray(metrics.nullValueCounts),
+        bytesMapToArray(metrics.lowerBounds),
+        bytesMapToArray(metrics.upperBounds)
       )
     }
  }
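
Reviewer note (a sketch, not part of the patch series above): the TODO left in
OrcMetrics covers the column-level metrics that the Scala side now threads
through mapToArray/bytesMapToArray. The ORC footer already carries most of what
is needed via Reader.getStatistics(). The code below is one possible shape for
the value-count and null-count pieces, under two stated assumptions: the ORC
column ordinal is used where Iceberg expects a field ID (a real implementation
must map ordinals through the Iceberg schema), and the null-count arithmetic
only holds for flat, top-level columns. The class name OrcFooterStats is made
up for illustration.

    // Sketch only: footer-derived counts, keyed by ORC column ordinal.
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.orc.ColumnStatistics;
    import org.apache.orc.Reader;

    final class OrcFooterStats {

      private OrcFooterStats() {}

      // Non-null value count per column. Index 0 of getStatistics() covers
      // the root struct, so per-column entries start at 1.
      static Map<Integer, Long> valueCounts(Reader reader) {
        ColumnStatistics[] stats = reader.getStatistics();
        Map<Integer, Long> counts = new HashMap<>();
        for (int col = 1; col < stats.length; col++) {
          counts.put(col, stats[col].getNumberOfValues());
        }
        return counts;
      }

      // Flat schemas only: nulls per top-level column fall out as total rows
      // minus the non-null values recorded for that column.
      static Map<Integer, Long> nullValueCounts(Reader reader) {
        ColumnStatistics[] stats = reader.getStatistics();
        long rows = reader.getNumberOfRows();
        Map<Integer, Long> nullCounts = new HashMap<>();
        for (int col = 1; col < stats.length; col++) {
          nullCounts.put(col, rows - stats[col].getNumberOfValues());
        }
        return nullCounts;
      }
    }

getNumberOfValues() counts non-null values, which is why the subtraction above
recovers a null count. Lower/upper bounds would come from the typed
ColumnStatistics subclasses (e.g. IntegerColumnStatistics) and need
type-by-type handling, so they are left out of this sketch.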