[HUDI-69] Support Spark Datasource for MOR table #1722
@@ -18,7 +18,9 @@
 package org.apache.hudi
 
 import org.apache.hudi.DataSourceReadOptions._
-import org.apache.hudi.exception.HoodieException
+import org.apache.hudi.common.model.HoodieTableType
+import org.apache.hudi.common.table.HoodieTableMetaClient
+import org.apache.hudi.exception.{HoodieException, TableNotFoundException}
 import org.apache.hudi.hadoop.HoodieROTablePathFilter
 import org.apache.log4j.LogManager
 import org.apache.spark.sql.execution.datasources.DataSource
@@ -58,26 +60,28 @@ class DefaultSource extends RelationProvider
       throw new HoodieException("'path' must be specified.")
     }
 
+    // Try to create hoodie table meta client from the give path
+    // TODO: Smarter path handling
+    val metaClient = try {
+      val conf = sqlContext.sparkContext.hadoopConfiguration
+      Option(new HoodieTableMetaClient(conf, path.get, true))
Member: Wouldn't it be problematic if

Member (Author): At this point we have:
What I am trying to do here is:

Member: Let me think about this more. We need to support some form of globbing for MOR/snapshot queries.

Member (Author): Udit's PR has this path handling. Should we merge part of his PR first? https://github.com/apache/hudi/pull/1702/files#diff-9a21766ebf794414f94b302bcb968f41R31
+    } catch {
+      case e: HoodieException => Option.empty
|
Member: Can we just error out there?

Member (Author): I used this as a flag that the
+    }
+
     if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_SNAPSHOT_OPT_VAL)) {
-      // this is just effectively RO view only, where `path` can contain a mix of
-      // non-hoodie/hoodie path files. set the path filter up
-      sqlContext.sparkContext.hadoopConfiguration.setClass(
-        "mapreduce.input.pathFilter.class",
-        classOf[HoodieROTablePathFilter],
-        classOf[org.apache.hadoop.fs.PathFilter])
-
-      log.info("Constructing hoodie (as parquet) data source with options :" + parameters)
-      log.warn("Snapshot view not supported yet via data source, for MERGE_ON_READ tables. " +
-        "Please query the Hive table registered using Spark SQL.")
-      // simply return as a regular parquet relation
-      DataSource.apply(
-        sparkSession = sqlContext.sparkSession,
-        userSpecifiedSchema = Option(schema),
-        className = "parquet",
-        options = parameters)
-        .resolveRelation()
+      if (metaClient.isDefined && metaClient.get.getTableType.equals(HoodieTableType.MERGE_ON_READ)) {
+        new SnapshotRelation(sqlContext, path.get, optParams, schema, metaClient.get)
+      } else {
+        getReadOptimizedView(sqlContext, parameters, schema)
+      }
+    } else if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)) {
+      getReadOptimizedView(sqlContext, parameters, schema)
     } else if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_INCREMENTAL_OPT_VAL)) {
-      new IncrementalRelation(sqlContext, path.get, optParams, schema)
+      if (metaClient.isEmpty) {
+        throw new TableNotFoundException(path.get)
+      }
+      new IncrementalRelation(sqlContext, path.get, optParams, schema, metaClient.get)
     } else {
       throw new HoodieException("Invalid query type :" + parameters(QUERY_TYPE_OPT_KEY))
     }
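Taken together, the branch above now routes a snapshot query on a MERGE_ON_READ table to the new SnapshotRelation, keeps the read-optimized path for the other cases, and falls back to the read-optimized view when the path cannot be resolved to a Hudi table base path (for example, a glob). A minimal usage sketch of the three query types, assuming a hypothetical local table path and the option constants from DataSourceReadOptions:

```scala
import org.apache.hudi.DataSourceReadOptions
import org.apache.spark.sql.SparkSession

object HudiMorReadSketch extends App {
  val spark = SparkSession.builder()
    .appName("hudi-mor-read-sketch")
    .master("local[*]")
    .getOrCreate()

  // Hypothetical table base path.
  val basePath = "file:///tmp/hudi/my_mor_table"

  // Snapshot query: for a MERGE_ON_READ table this resolves to SnapshotRelation,
  // which merges base parquet files with their log files where needed.
  val snapshotDF = spark.read.format("hudi")
    .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
    .load(basePath)

  // Read-optimized query: base parquet files only, no log merging.
  val roDF = spark.read.format("hudi")
    .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
    .load(basePath)

  // Incremental query: requires the path to resolve to a Hudi table (metaClient defined).
  val incrementalDF = spark.read.format("hudi")
    .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
    .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "000")
    .load(basePath)

  snapshotDF.show(); roDF.show(); incrementalDF.show()
}
```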
@@ -123,4 +127,25 @@ class DefaultSource extends RelationProvider
   }
 
   override def shortName(): String = "hudi"
+
+  private def getReadOptimizedView(sqlContext: SQLContext,
Member: We can rename this to something like

Member (Author): Sure, will do.
+                                   optParams: Map[String, String],
+                                   schema: StructType): BaseRelation = {
+    log.warn("Loading Read Optimized view.")
+    // this is just effectively RO view only, where `path` can contain a mix of
+    // non-hoodie/hoodie path files. set the path filter up
+    sqlContext.sparkContext.hadoopConfiguration.setClass(
+      "mapreduce.input.pathFilter.class",
+      classOf[HoodieROTablePathFilter],
+      classOf[org.apache.hadoop.fs.PathFilter])
+
+    log.info("Constructing hoodie (as parquet) data source with options :" + optParams)
+    // simply return as a regular parquet relation
+    DataSource.apply(
+      sparkSession = sqlContext.sparkSession,
+      userSpecifiedSchema = Option(schema),
+      className = "parquet",
+      options = optParams)
+      .resolveRelation()
+  }
 }
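getReadOptimizedView is essentially the pre-existing read path: it installs HoodieROTablePathFilter so that only the latest committed base files survive file listing, then hands off to the plain parquet datasource. A rough equivalent that a user could write by hand today looks like the sketch below (a sketch only, with a hypothetical partition glob):

```scala
import org.apache.hadoop.fs.PathFilter
import org.apache.hudi.hadoop.HoodieROTablePathFilter
import org.apache.spark.sql.SparkSession

object ManualReadOptimizedSketch extends App {
  val spark = SparkSession.builder()
    .appName("manual-ro-read-sketch")
    .master("local[*]")
    .getOrCreate()

  // Install the Hudi path filter so that, for each file group, only the latest
  // committed base parquet file is visible to the listing.
  spark.sparkContext.hadoopConfiguration.setClass(
    "mapreduce.input.pathFilter.class",
    classOf[HoodieROTablePathFilter],
    classOf[PathFilter])

  // Hypothetical partition glob under the table base path.
  val df = spark.read.parquet("file:///tmp/hudi/my_mor_table/*/*/*")
  df.show()
}
```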
@@ -0,0 +1,139 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.avro.HoodieAvroUtils
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.common.table.timeline.HoodieTimeline
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.hadoop.{HoodieParquetInputFormat, HoodieROTablePathFilter}
import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.table.HoodieTable

import org.apache.hadoop.mapred.JobConf
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types.StructType

import java.util
import scala.collection.JavaConverters._

/**
 * This is the Spark DataSourceV1 relation to read Hudi MOR table.
 *
 * @param sqlContext
 * @param basePath
 * @param optParams
 * @param userSchema
 */
class SnapshotRelation(val sqlContext: SQLContext,
                       val basePath: String,
                       val optParams: Map[String, String],
                       val userSchema: StructType,
                       val metaClient: HoodieTableMetaClient) extends BaseRelation with PrunedFilteredScan {

  private val log = LogManager.getLogger(classOf[SnapshotRelation])
  private val conf = sqlContext.sparkContext.hadoopConfiguration

  // Load Hudi table
  private val hoodieTable = HoodieTable.create(metaClient, HoodieWriteConfig.newBuilder().withPath(basePath).build(), conf)
  private val commitTimeline = hoodieTable.getMetaClient.getCommitsAndCompactionTimeline
  if (commitTimeline.empty()) {
    throw new HoodieException("No Valid Hudi timeline exists")
  }
  private val completedCommitTimeline = hoodieTable.getMetaClient.getCommitsTimeline.filterCompletedInstants()
  private val lastInstant = completedCommitTimeline.lastInstant().get()

  // Set config for listStatus() in HoodieParquetInputFormat
  conf.setClass(
    "mapreduce.input.pathFilter.class",
    classOf[HoodieROTablePathFilter],
    classOf[org.apache.hadoop.fs.PathFilter])
  conf.setStrings("mapreduce.input.fileinputformat.inputdir", basePath)
  conf.setStrings("mapreduce.input.fileinputformat.input.dir.recursive", "true")
  conf.setStrings("hoodie.realtime.last.commit", lastInstant.getTimestamp)

  private val hoodieInputFormat = new HoodieParquetInputFormat
  hoodieInputFormat.setConf(conf)

  // List all parquet files
  private val fileStatus = hoodieInputFormat.listStatus(new JobConf(conf))

  val (parquetPaths, parquetWithLogPaths) = if (lastInstant.getAction.equals(HoodieTimeline.COMMIT_ACTION)
    || lastInstant.getAction.equals(HoodieTimeline.COMPACTION_ACTION)) {
    (fileStatus.map(f => f.getPath.toString).toList, Map.empty[String, String])
  } else {
    val fileGroups = HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, util.Arrays.stream(fileStatus)).asScala
    // Split the file group to: parquet file without a matching log file, parquet file need to merge with log files
    val parquetPaths: List[String] = fileGroups.filter(p => p._2.size() == 0).keys.toList
    val parquetWithLogPaths: Map[String, String] = fileGroups
      .filter(p => p._2.size() > 0)
      .map{ case(k, v) => (k, v.asScala.toList.mkString(","))}
      .toMap
    (parquetPaths, parquetWithLogPaths)
  }

  if (log.isDebugEnabled) {
    log.debug("Stand alone parquet files: \n" + parquetPaths.mkString("\n"))
    log.debug("Parquet files that have matching log files: \n" + parquetWithLogPaths.map(m => s"${m._1}:${m._2}").mkString("\n"))
  }

  // Add log file map to options
  private val finalOps = optParams ++ parquetWithLogPaths

  // use schema from latest metadata, if not present, read schema from the data file
  private val latestSchema = {
    val schemaUtil = new TableSchemaResolver(metaClient)
    val tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaUtil.getTableAvroSchemaWithoutMetadataFields)
    AvroConversionUtils.convertAvroSchemaToStructType(tableSchema)
  }

  override def schema: StructType = latestSchema

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    if (parquetWithLogPaths.isEmpty) {
      sqlContext
        .read
        .options(finalOps)
        .schema(schema)
        .format("parquet")
        .load(parquetPaths:_*)
        .selectExpr(requiredColumns:_*)
        .rdd
    } else {
      val regularParquet = sqlContext
        .read
        .options(finalOps)
        .schema(schema)
        .format("parquet")
        .load(parquetPaths:_*)
      // Hudi parquet files needed to merge with log file
      sqlContext
        .read
        .options(finalOps)
        .schema(schema)
        .format("org.apache.spark.sql.execution.datasources.parquet.HoodieParquetRealtimeFileFormat")
        .load(parquetWithLogPaths.keys.toList: _*)
        .union(regularParquet)
        .rdd
    }
  }
}
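buildScan therefore produces one RDD from two logical reads over the same schema: standalone base files go through the stock parquet format, while base files that have log files go through the Hudi realtime file format, and the two DataFrames are unioned. A generic, self-contained sketch of that pattern in plain Spark (hypothetical local paths, with ordinary parquet standing in for the Hudi-specific realtime format):

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

object UnionScanSketch extends App {
  val spark = SparkSession.builder()
    .appName("union-scan-sketch")
    .master("local[*]")
    .getOrCreate()
  import spark.implicits._

  // Write two small parquet "file groups" so the sketch is runnable end to end.
  val standalonePath = "file:///tmp/union_sketch/standalone"
  val withLogsPath = "file:///tmp/union_sketch/with_logs"
  Seq(("key1", "v1")).toDF("_hoodie_record_key", "value").write.mode("overwrite").parquet(standalonePath)
  Seq(("key2", "v2")).toDF("_hoodie_record_key", "value").write.mode("overwrite").parquet(withLogsPath)

  def readAs(format: String, paths: String*): DataFrame =
    spark.read.format(format).load(paths: _*)

  // SnapshotRelation does the same thing, except the second read would use the Hudi
  // realtime file format, which merges log records into the base-file rows.
  val merged = readAs("parquet", standalonePath)
    .union(readAs("parquet", withLogsPath))

  merged.show()
}
```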
Don't think this change is necessary, right? The RO view does map to the snapshot query for COW. We may need to have two maps, one for COW and one for MOR.

I got confused by the naming sometimes...
For a COW table, snapshot view = read optimized view; for MOR, the snapshot view and the read optimized view are different things. With bootstrap, we will have one more view.
Can we call:
read optimized view -> parquet only (including bootstrap)
snapshot view -> parquet (with bootstrap) merged with log
regardless of table type?

Let's keep this mapping because we should be able to do an RO view on MOR.

No.. there are no more views.. we did a renaming exercise to clear things up as "query types".. with that there should be no confusion.. our docs are consistent with this as well.. On COW there is in fact no RO view.. so this change has to be done differently, if you need it for MOR..

Sorry, my previous comments are confusing, let me rephrase.
What I am trying to do here is to not change the query behavior. Since we did not support snapshot queries for MOR before, the RO and snapshot query types behaved the same regardless of whether the table is COW or MOR.
If we don't change this mapping, users will see different behavior after upgrading to the next release. If they are using VIEW_TYPE_READ_OPTIMIZED_OPT_VAL (deprecated) on MOR in their code, then after upgrading the code will run a snapshot query instead of an RO query. This could surprise users even though the key is deprecated.

We have been logging a warning for some time on the use of the deprecated configs, so I think it's fair to do the right thing here moving forward and call this out in the release notes. Let me push some changes..
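For reference, the two spellings being discussed, as a user would write them. This is a hedged sketch: the table path is hypothetical, and the deprecated view-type constants are assumed to still be present in DataSourceReadOptions alongside the newer query-type constants.

```scala
import org.apache.hudi.DataSourceReadOptions
import org.apache.spark.sql.SparkSession

object QueryTypeOptionsSketch extends App {
  val spark = SparkSession.builder()
    .appName("query-type-options-sketch")
    .master("local[*]")
    .getOrCreate()

  // Hypothetical MOR table path.
  val basePath = "file:///tmp/hudi/my_mor_table"

  // Old, deprecated spelling: whether this maps to a read-optimized or a snapshot
  // query on MOR after this PR is exactly what the thread above is deciding.
  val deprecatedStyle = spark.read.format("hudi")
    .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_READ_OPTIMIZED_OPT_VAL)
    .load(basePath)

  // Explicit, non-deprecated spelling that keeps read-optimized semantics on MOR.
  val explicitReadOptimized = spark.read.format("hudi")
    .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
    .load(basePath)
}
```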