From 4b588f409bb41432ecb2d45ce3c7156b84f7ede9 Mon Sep 17 00:00:00 2001
From: Edgar Rodriguez
Date: Fri, 7 Jun 2019 16:06:44 -0700
Subject: [PATCH 1/2] Add ORC support for listPartitions

---
 .../apache/iceberg/spark/SparkTableUtil.scala | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala b/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
index 86dfbe3772c8..08fec59f1366 100644
--- a/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
+++ b/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
@@ -73,6 +73,8 @@ object SparkTableUtil {
       listAvroPartition(partition, uri)
     } else if (format.contains("parquet")) {
       listParquetPartition(partition, uri)
+    } else if (format.contains("orc")) {
+      listOrcPartition(partition, uri)
     } else {
       throw new UnsupportedOperationException(s"Unknown partition format: $format")
     }
@@ -248,5 +250,28 @@ object SparkTableUtil {
         bytesMapToArray(metrics.lowerBounds),
         bytesMapToArray(metrics.upperBounds))
     }
   }
+
+  private def listOrcPartition(
+      partitionPath: Map[String, String],
+      partitionUri: String): Seq[SparkDataFile] = {
+    val conf = new Configuration()
+    val partition = new Path(partitionUri)
+    val fs = partition.getFileSystem(conf)
+
+    fs.listStatus(partition, HiddenPathFilter).filter(_.isFile).map { stat =>
+      // TODO: add ORC metrics
+      SparkDataFile(
+        stat.getPath.toString,
+        partitionPath, "orc", stat.getLen,
+        stat.getBlockSize,
+        null,
+        null,
+        null,
+        null,
+        null,
+        null
+      )
+    }
+  }
 }

From f651eaecceae40fa4ca85ebb6025f2de8113c49b Mon Sep 17 00:00:00 2001
From: Edgar Rodriguez
Date: Fri, 7 Jun 2019 16:27:03 -0700
Subject: [PATCH 2/2] Add OrcMetrics with row count only

---
 .../org/apache/iceberg/orc/OrcMetrics.java    | 60 +++++++++++++++++++
 .../apache/iceberg/spark/SparkTableUtil.scala | 18 +++---
 2 files changed, 71 insertions(+), 7 deletions(-)
 create mode 100644 orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java

diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
new file mode 100644
index 000000000000..2defc7d2fd74
--- /dev/null
+++ b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.orc;
+
+import java.io.IOException;
+import java.util.Collections;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.iceberg.Metrics;
+import org.apache.iceberg.exceptions.RuntimeIOException;
+import org.apache.iceberg.hadoop.HadoopInputFile;
+import org.apache.iceberg.io.InputFile;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+
+public class OrcMetrics {
+
+  private OrcMetrics() {}
+
+  public static Metrics fromInputFile(InputFile file) {
+    final Configuration config = (file instanceof HadoopInputFile)
+        ? ((HadoopInputFile) file).getConf()
+        : new Configuration();
+    return fromInputFile(file, config);
+  }
+
+  public static Metrics fromInputFile(InputFile file, Configuration config) {
+    try {
+      final Reader orcReader = OrcFile.createReader(new Path(file.location()),
+          OrcFile.readerOptions(config));
+
+      // TODO: implement the remaining ORC metrics (column sizes, value counts, bounds)
+      return new Metrics(orcReader.getNumberOfRows(),
+          null,
+          null,
+          Collections.emptyMap(),
+          null,
+          null);
+    } catch (IOException ioe) {
+      throw new RuntimeIOException(ioe, "Failed to read footer of file: %s", file);
+    }
+  }
+}
diff --git a/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala b/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
index 08fec59f1366..9c95916d3757 100644
--- a/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
+++ b/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
@@ -32,6 +32,9 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
 import scala.collection.JavaConverters._
 
+import org.apache.iceberg.hadoop.HadoopInputFile
+import org.apache.iceberg.orc.OrcMetrics
+
 object SparkTableUtil {
   /**
    * Returns a DataFrame with a row for each partition in the table.
@@ -259,17 +262,18 @@ object SparkTableUtil {
     val fs = partition.getFileSystem(conf)
 
     fs.listStatus(partition, HiddenPathFilter).filter(_.isFile).map { stat =>
-      // TODO: add ORC metrics
+      val metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath, conf))
+
       SparkDataFile(
         stat.getPath.toString,
         partitionPath, "orc", stat.getLen,
         stat.getBlockSize,
-        null,
-        null,
-        null,
-        null,
-        null,
-        null
+        metrics.recordCount,
+        mapToArray(metrics.columnSizes),
+        mapToArray(metrics.valueCounts),
+        mapToArray(metrics.nullValueCounts),
+        bytesMapToArray(metrics.lowerBounds),
+        bytesMapToArray(metrics.upperBounds)
       )
     }
  }
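
Reviewer note (a sketch, not part of the patch series above): the TODO left in
OrcMetrics covers the column-level metrics that the Scala side now threads
through mapToArray/bytesMapToArray. The ORC footer already carries most of what
is needed via Reader.getStatistics(). The code below is one possible shape for
the value-count and null-count pieces, under two stated assumptions: the ORC
column ordinal is used where Iceberg expects a field ID (a real implementation
must map ordinals through the Iceberg schema), and the null-count arithmetic
only holds for flat, top-level columns. The class name OrcFooterStats is made
up for illustration.

    // Sketch only: footer-derived counts, keyed by ORC column ordinal.
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.orc.ColumnStatistics;
    import org.apache.orc.Reader;

    final class OrcFooterStats {

      private OrcFooterStats() {}

      // Non-null value count per column. Index 0 of getStatistics() covers
      // the root struct, so per-column entries start at 1.
      static Map<Integer, Long> valueCounts(Reader reader) {
        ColumnStatistics[] stats = reader.getStatistics();
        Map<Integer, Long> counts = new HashMap<>();
        for (int col = 1; col < stats.length; col++) {
          counts.put(col, stats[col].getNumberOfValues());
        }
        return counts;
      }

      // Flat schemas only: nulls per top-level column fall out as total rows
      // minus the non-null values recorded for that column.
      static Map<Integer, Long> nullValueCounts(Reader reader) {
        ColumnStatistics[] stats = reader.getStatistics();
        long rows = reader.getNumberOfRows();
        Map<Integer, Long> nullCounts = new HashMap<>();
        for (int col = 1; col < stats.length; col++) {
          nullCounts.put(col, rows - stats[col].getNumberOfValues());
        }
        return nullCounts;
      }
    }

getNumberOfValues() counts non-null values, which is why the subtraction above
recovers a null count. Lower/upper bounds would come from the typed
ColumnStatistics subclasses (e.g. IntegerColumnStatistics) and need
type-by-type handling, so they are left out of this sketch.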