[SC-5819] Optimize the speed of writing metadata

zsxwing · zsxwing · commit 89dfc5cbb8ca · 2017-02-27T22:06:04.000-08:00
## What changes were proposed in this pull request? https://github.com/databricks/spark/commit/9afe6c18f2d472c0dc9f820871225b299872fcd7 was reverted because of the merge conflicts. This PR just added it back. ## How was this patch tested? Jenkins Author: Shixiong Zhu <shixiong@databricks.com> Closes apache#249 from zsxwing/opt-2.1.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.streaming
 
 import java.io._
 import java.nio.charset.StandardCharsets
-import java.util.{ConcurrentModificationException, EnumSet, UUID}
+import java.util.{ConcurrentModificationException, EnumSet, LinkedHashMap, UUID}
 
 import scala.reflect.ClassTag
 
@@ -93,6 +93,14 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
     }
   }
 
+  /**
+   * Cache the latest two batches. [[StreamExecution]] usually just accesses the latest two batches
+   * when committing offsets, so this cache will save some file system operations.
+   */
+  private val batchCache = new LinkedHashMap[Long, T](2) {
+    override def removeEldestEntry(e: java.util.Map.Entry[Long, T]): Boolean = size > 2
+  }
+
   /**
    * A `PathFilter` to filter only batch files
    */
@@ -133,6 +141,7 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
    * metadata has already been stored, this method will return `false`.
    */
   override def add(batchId: Long, metadata: T): Boolean = {
+    assert(metadata != null)
     get(batchId).map(_ => false).getOrElse {
       // Only write metadata when the batch has not yet been written
       runUninterruptiblyIfLocal {
@@ -142,17 +151,21 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
     }
   }
 
-  private def writeTempBatch(metadata: T): Option[Path] = {
+  private def writeBatchToFile(metadata: T, path: Path): Unit = {
+    val output = fileManager.create(path)
+    try {
+      serialize(metadata, output)
+    } finally {
+      IOUtils.closeQuietly(output)
+    }
+  }
+
+  private def writeTempBatch(metadata: T): Path = {
     while (true) {
       val tempPath = new Path(metadataPath, s".${UUID.randomUUID.toString}.tmp")
       try {
-        val output = fileManager.create(tempPath)
-        try {
-          serialize(metadata, output)
-          return Some(tempPath)
-        } finally {
-          IOUtils.closeQuietly(output)
-        }
+        writeBatchToFile(metadata, tempPath)
+        return tempPath
       } catch {
         case e: IOException if isFileAlreadyExistsException(e) =>
           // Failed to create "tempPath". There are two cases:
@@ -169,7 +182,8 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
           // malicious checkpoint files to crash a Streaming application too.
       }
     }
-    None
+    assert(false, "should not happen")
+    null
   }
 
   /**
@@ -179,18 +193,21 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
    * valid behavior, we still need to prevent it from destroying the files.
    */
   private def writeBatch(batchId: Long, metadata: T): Unit = {
-    val tempPath = writeTempBatch(metadata).getOrElse(
-      throw new IllegalStateException(s"Unable to create temp batch file $batchId"))
+    if (!fileManager.supportsAtomicRename) {
+      // The underlying file system implementation doesn't support atomic rename, so write to the
+      // target path directly.
+      writeBatchToFile(metadata, batchIdToPath(batchId))
+      batchCache.put(batchId, metadata)
+      return
+    }
+
+    val tempPath = writeTempBatch(metadata)
     try {
       // Try to commit the batch
       // It will fail if there is an existing file (someone has committed the batch)
       logDebug(s"Attempting to write log #${batchIdToPath(batchId)}")
       fileManager.rename(tempPath, batchIdToPath(batchId))
-
-      // SPARK-17475: HDFSMetadataLog should not leak CRC files
-      // If the underlying filesystem didn't rename the CRC file, delete it.
-      val crcPath = new Path(tempPath.getParent(), s".${tempPath.getName()}.crc")
-      if (fileManager.exists(crcPath)) fileManager.delete(crcPath)
+      batchCache.put(batchId, metadata)
     } catch {
       case e: IOException if isFileAlreadyExistsException(e) =>
         // If "rename" fails, it means some other "HDFSMetadataLog" has committed the batch.
@@ -200,6 +217,11 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
     } finally {
       fileManager.delete(tempPath)
     }
+
+    // SPARK-17475: HDFSMetadataLog should not leak CRC files
+    // If the underlying filesystem didn't rename the CRC file, delete it.
+    val crcPath = new Path(tempPath.getParent(), s".${tempPath.getName()}.crc")
+    if (fileManager.exists(crcPath)) fileManager.delete(crcPath)
   }
 
   private def isFileAlreadyExistsException(e: IOException): Boolean = {
@@ -213,7 +235,7 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
    * @return the deserialized metadata in a batch file, or None if file not exist.
    * @throws IllegalArgumentException when path does not point to a batch file.
    */
-  def get(batchFile: Path): Option[T] = {
+  private def get(batchFile: Path): Option[T] = {
     if (fileManager.exists(batchFile)) {
       if (isBatchFile(batchFile)) {
         get(pathToBatchId(batchFile))
@@ -226,6 +248,12 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
   }
 
   override def get(batchId: Long): Option[T] = {
+    if (batchCache.containsKey(batchId)) {
+      val metadata = batchCache.get(batchId)
+      assert(metadata != null)
+      return Some(metadata)
+    }
+
     val batchMetadataFile = batchIdToPath(batchId)
     if (fileManager.exists(batchMetadataFile)) {
       val input = fileManager.open(batchMetadataFile)
@@ -267,17 +295,6 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
     None
   }
 
-  /**
-   * Get an array of [FileStatus] referencing batch files.
-   * The array is sorted by most recent batch file first to
-   * oldest batch file.
-   */
-  def getOrderedBatchFiles(): Array[FileStatus] = {
-    fileManager.list(metadataPath, batchFilesFilter)
-      .sortBy(f => pathToBatchId(f.getPath))
-      .reverse
-  }
-
   /**
    * Removes all the log entry earlier than thresholdBatchId (exclusive).
    */
@@ -288,6 +305,7 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
     for (batchId <- batchIds if batchId < thresholdBatchId) {
       val path = batchIdToPath(batchId)
       fileManager.delete(path)
+      batchCache.remove(batchId)
       logTrace(s"Removed metadata log file: $path")
     }
   }
@@ -326,6 +344,9 @@ object HDFSMetadataLog {
     /** Create path, or throw exception if it already exists */
     def create(path: Path): FSDataOutputStream
 
+    /** Whether the file system supports atomic rename. */
+    def supportsAtomicRename: Boolean = true
+
     /**
      * Atomically rename path, or throw exception if it cannot be done.
      * Should throw FileNotFoundException if srcPath does not exist.
@@ -336,7 +357,7 @@ object HDFSMetadataLog {
     /** Recursively delete a path if it exists. Should not throw exception if file doesn't exist. */
     def delete(path: Path): Unit
 
-    /** Whether the file systme is a local FS. */
+    /** Whether the file system is a local FS. */
     def isLocalFileSystem: Boolean
   }
 
@@ -354,6 +375,8 @@ object HDFSMetadataLog {
       fc.util.listStatus(path, filter)
     }
 
+    override def supportsAtomicRename: Boolean = true
+
     override def rename(srcPath: Path, destPath: Path): Unit = {
       fc.rename(srcPath, destPath)
     }
@@ -403,6 +426,8 @@ object HDFSMetadataLog {
       fs.listStatus(path, filter)
     }
 
+    override def supportsAtomicRename: Boolean = false
+
     /**
      * Rename a path. Note that this implementation is not atomic.
      * @throws FileNotFoundException if source path does not exist.