
Commit dab44ed

Finish the PR
1 parent 0a84178 commit dab44ed

File tree

3 files changed: +74 −120 lines


core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 6 additions & 0 deletions
@@ -272,4 +272,10 @@ package object config {
     .booleanConf
     .createWithDefault(false)
 
+  private[spark] val CHECKPOINT_COMPRESS =
+    ConfigBuilder("spark.checkpoint.compress")
+      .doc("Whether to compress RDD checkpoints. Generally a good idea. Compression will use " +
+        "spark.io.compression.codec.")
+      .booleanConf
+      .createWithDefault(false)
 }
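For context (not part of the diff), this is roughly how the new flag would be enabled from application code; the app name, master, codec choice, checkpoint directory, and RDD below are illustrative only:

import org.apache.spark.{SparkConf, SparkContext}

// Illustrative sketch: turn on the flag added above and pick the codec it delegates to.
val conf = new SparkConf()
  .setAppName("checkpoint-compression-example")   // hypothetical app name
  .setMaster("local[2]")
  .set("spark.checkpoint.compress", "true")       // new flag from this commit (default: false)
  .set("spark.io.compression.codec", "lz4")       // existing codec config the new flag reuses

val sc = new SparkContext(conf)
sc.setCheckpointDir("/tmp/checkpoints")           // hypothetical directory

val rdd = sc.parallelize(1 to 1000).map(_ * 2)
rdd.checkpoint()                                  // checkpoint files are now written compressed
rdd.count()                                       // materializes the RDD and triggers the checkpoint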

core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala

Lines changed: 22 additions & 28 deletions
@@ -17,7 +17,8 @@
 
 package org.apache.spark.rdd
 
-import java.io.{FileNotFoundException, InputStream, IOException, OutputStream}
+import java.io.{FileNotFoundException, IOException}
+import java.util.concurrent.TimeUnit
 
 import scala.reflect.ClassTag
 import scala.util.control.NonFatal
@@ -27,11 +28,10 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark._
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config.CHECKPOINT_COMPRESS
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.util.{SerializableConfiguration, Utils}
 
-
-
 /**
  * An RDD that reads from checkpoint files previously written to reliable storage.
  */
@@ -122,6 +122,7 @@ private[spark] object ReliableCheckpointRDD extends Logging {
       originalRDD: RDD[T],
       checkpointDir: String,
       blockSize: Int = -1): ReliableCheckpointRDD[T] = {
+    val checkpointStartTimeNs = System.nanoTime()
 
     val sc = originalRDD.sparkContext
 
@@ -136,18 +137,17 @@ private[spark] object ReliableCheckpointRDD extends Logging {
     val broadcastedConf = sc.broadcast(
       new SerializableConfiguration(sc.hadoopConfiguration))
     // TODO: This is expensive because it computes the RDD again unnecessarily (SPARK-8582)
-    val startTime = System.currentTimeMillis()
     sc.runJob(originalRDD,
       writePartitionToCheckpointFile[T](checkpointDirPath.toString, broadcastedConf) _)
 
-    logInfo(s"Checkpointing took ${System.currentTimeMillis() - startTime} ms.")
-    sc.conf.getOption("spark.checkpoint.compress.codec").foreach(codec => {
-      logInfo(s"The checkpoint compression codec is $codec.")
-    })
     if (originalRDD.partitioner.nonEmpty) {
       writePartitionerToCheckpointDir(sc, originalRDD.partitioner.get, checkpointDirPath)
     }
 
+    val checkpointDurationMs =
+      TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - checkpointStartTimeNs)
+    logInfo(s"Checkpointing took $checkpointDurationMs ms.")
+
     val newRDD = new ReliableCheckpointRDD[T](
       sc, checkpointDirPath.toString, originalRDD.partitioner)
     if (newRDD.partitions.length != originalRDD.partitions.length) {
@@ -164,7 +164,7 @@ private[spark] object ReliableCheckpointRDD extends Logging {
   def writePartitionToCheckpointFile[T: ClassTag](
       path: String,
       broadcastedConf: Broadcast[SerializableConfiguration],
-      blockSize: Int = -1)(ctx: TaskContext, iterator: Iterator[T]): Unit = {
+      blockSize: Int = -1)(ctx: TaskContext, iterator: Iterator[T]) {
     val env = SparkEnv.get
     val outputDir = new Path(path)
     val fs = outputDir.getFileSystem(broadcastedConf.value.value)
@@ -177,13 +177,11 @@ private[spark] object ReliableCheckpointRDD extends Logging {
     val bufferSize = env.conf.getInt("spark.buffer.size", 65536)
 
     val fileOutputStream = if (blockSize < 0) {
-      lazy val fileStream: OutputStream = fs.create(tempOutputPath, false, bufferSize)
-      env.conf.getOption("spark.checkpoint.compress.codec").fold(fileStream) {
-        codec => {
-          logDebug(s"Compressing using $codec.")
-          CompressionCodec.createCodec(env.conf, codec)
-            .compressedOutputStream(fileStream)
-        }
+      val fileStream = fs.create(tempOutputPath, false, bufferSize)
+      if (env.conf.get(CHECKPOINT_COMPRESS)) {
+        CompressionCodec.createCodec(env.conf).compressedOutputStream(fileStream)
+      } else {
+        fileStream
       }
     } else {
       // This is mainly for testing purpose
@@ -192,8 +190,6 @@ private[spark] object ReliableCheckpointRDD extends Logging {
     }
     val serializer = env.serializer.newInstance()
     val serializeStream = serializer.serializeStream(fileOutputStream)
-    logTrace(s"Starting to write to checkpoint file $tempOutputPath.")
-    val startTimeMs = System.currentTimeMillis()
     Utils.tryWithSafeFinally {
       serializeStream.writeAll(iterator)
     } {
@@ -214,7 +210,6 @@ private[spark] object ReliableCheckpointRDD extends Logging {
         }
       }
     }
-    logInfo(s"Checkpointing took ${System.currentTimeMillis() - startTimeMs} ms.")
   }
 
   /**
@@ -291,17 +286,16 @@ private[spark] object ReliableCheckpointRDD extends Logging {
     val env = SparkEnv.get
     val fs = path.getFileSystem(broadcastedConf.value.value)
     val bufferSize = env.conf.getInt("spark.buffer.size", 65536)
-    lazy val fileStream: InputStream = fs.open(path, bufferSize)
-    val inputStream: InputStream =
-      env.conf.getOption("spark.checkpoint.compress.codec").fold(fileStream) {
-        codec => {
-          logDebug(s"Decompressing using $codec.")
-          CompressionCodec.createCodec(env.conf, codec)
-            .compressedInputStream(fileStream)
-        }
+    val fileInputStream = {
+      val fileStream = fs.open(path, bufferSize)
+      if (env.conf.get(CHECKPOINT_COMPRESS)) {
+        CompressionCodec.createCodec(env.conf).compressedInputStream(fileStream)
+      } else {
+        fileStream
       }
+    }
     val serializer = env.serializer.newInstance()
-    val deserializeStream = serializer.deserializeStream(inputStream)
+    val deserializeStream = serializer.deserializeStream(fileInputStream)
 
     // Register an on-task-completion callback to close the input stream.
     context.addTaskCompletionListener(context => deserializeStream.close())
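The write and read paths above are symmetric: the raw Hadoop stream is wrapped in a codec stream only when CHECKPOINT_COMPRESS is set, so one config value governs both sides. A minimal standalone sketch of that pattern, using GZIP from the JDK as a stand-in for Spark's CompressionCodec (the helper names are made up for illustration):

import java.io.{InputStream, OutputStream}
import java.util.zip.{GZIPInputStream, GZIPOutputStream}

// Hypothetical helpers mirroring the wrap-if-enabled pattern in the diff above.
def maybeCompressed(out: OutputStream, compress: Boolean): OutputStream =
  if (compress) new GZIPOutputStream(out) else out

def maybeDecompressed(in: InputStream, compress: Boolean): InputStream =
  if (compress) new GZIPInputStream(in) else in

// Both helpers must see the same boolean, otherwise reads fail; this is why the
// patch keys the writer and the reader off the single CHECKPOINT_COMPRESS entry.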

core/src/test/scala/org/apache/spark/CheckpointSuite.scala

Lines changed: 46 additions & 92 deletions
@@ -29,7 +29,6 @@ import org.apache.spark.rdd._
 import org.apache.spark.storage.{BlockId, StorageLevel, TestBlockId}
 import org.apache.spark.util.Utils
 
-
 trait RDDCheckpointTester { self: SparkFunSuite =>
 
   protected val partitioner = new HashPartitioner(2)
@@ -241,42 +240,6 @@ trait RDDCheckpointTester { self: SparkFunSuite =>
   protected def generateFatPairRDD(): RDD[(Int, Int)] = {
     new FatPairRDD(sparkContext.makeRDD(1 to 100, 4), partitioner).mapValues(x => x)
   }
-
-  protected def testBasicCheckpoint(sc: SparkContext, reliableCheckpoint: Boolean): Unit = {
-    val parCollection = sc.makeRDD(1 to 4)
-    val flatMappedRDD = parCollection.flatMap(x => 1 to x)
-    checkpoint(flatMappedRDD, reliableCheckpoint)
-    assert(flatMappedRDD.dependencies.head.rdd === parCollection)
-    val result = flatMappedRDD.collect()
-    assert(flatMappedRDD.dependencies.head.rdd != parCollection)
-    assert(flatMappedRDD.collect() === result)
-  }
-
-  protected def testCompression(checkpointDir: File, compressionCodec: String): Unit = {
-    val sparkConf = new SparkConf()
-    sparkConf.set("spark.checkpoint.compress.codec", compressionCodec)
-    val sc = new SparkContext("local", "test", sparkConf)
-    sc.setCheckpointDir(checkpointDir.toString)
-    val initialSize = 20
-    // Use just one partition for now since compression works best on large data sets.
-    val collection = sc.makeRDD(1 to initialSize, numSlices = 1)
-    val flatMappedRDD = collection.flatMap(x => 1 to x)
-    checkpoint(flatMappedRDD, reliableCheckpoint = true)
-    assert(flatMappedRDD.collect().length == initialSize * (initialSize + 1)/2,
-      "The checkpoint was lossy!")
-    sc.stop()
-    val checkpointPath = new Path(flatMappedRDD.getCheckpointFile.get)
-    val fs = checkpointPath.getFileSystem(sc.hadoopConfiguration)
-    val fileStatus = fs.listStatus(checkpointPath).find(_.getPath.getName.startsWith("part-")).get
-    val compressedSize = fileStatus.getLen
-    assert(compressedSize > 0, "The checkpoint file was not written!")
-    val compressedInputStream = CompressionCodec.createCodec(sparkConf, compressionCodec)
-      .compressedInputStream(fs.open(fileStatus.getPath))
-    val uncompressedSize = ByteStreams.toByteArray(compressedInputStream).length
-    compressedInputStream.close()
-    assert(compressedSize < uncompressedSize, "The compression was not successful!")
-  }
-
 }
 
 /**
@@ -290,14 +253,10 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
     super.beforeEach()
     checkpointDir = File.createTempFile("temp", "", Utils.createTempDir())
     checkpointDir.delete()
-  }
-
-  private def startSparkContext(): Unit = {
     sc = new SparkContext("local", "test")
     sc.setCheckpointDir(checkpointDir.toString)
   }
 
-
   override def afterEach(): Unit = {
     try {
       Utils.deleteRecursively(checkpointDir)
@@ -309,44 +268,13 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   override def sparkContext: SparkContext = sc
 
   runTest("basic checkpointing") { reliableCheckpoint: Boolean =>
-    startSparkContext()
-    testBasicCheckpoint(sc, reliableCheckpoint)
-  }
-
-  runTest("compression with snappy", skipLocalCheckpoint = true) { _: Boolean =>
-    val sparkConf = new SparkConf()
-    sparkConf.set("spark.checkpoint.compress.codec", "snappy")
-    sc = new SparkContext("local", "test", sparkConf)
-    sc.setCheckpointDir(checkpointDir.toString)
-    testBasicCheckpoint(sc, reliableCheckpoint = true)
-  }
-
-  runTest("compression with lz4", skipLocalCheckpoint = true) { _: Boolean =>
-    val sparkConf = new SparkConf()
-    sparkConf.set("spark.checkpoint.compress.codec", "lz4")
-    sc = new SparkContext("local", "test", sparkConf)
-    sc.setCheckpointDir(checkpointDir.toString)
-    testBasicCheckpoint(sc, reliableCheckpoint = true)
-  }
-
-  runTest("compression with lzf", skipLocalCheckpoint = true) { _: Boolean =>
-    val sparkConf = new SparkConf()
-    sparkConf.set("spark.checkpoint.compress.codec", "lzf")
-    sc = new SparkContext("local", "test", sparkConf)
-    sc.setCheckpointDir(checkpointDir.toString)
-    testBasicCheckpoint(sc, reliableCheckpoint = true)
-  }
-
-  runTest("compression size snappy", skipLocalCheckpoint = true) { _: Boolean =>
-    testCompression(checkpointDir, "snappy")
-  }
-
-  runTest("compression size lzf", skipLocalCheckpoint = true) { _: Boolean =>
-    testCompression(checkpointDir, "lzf")
-  }
-
-  runTest("compression size lz4", skipLocalCheckpoint = true) { _: Boolean =>
-    testCompression(checkpointDir, "lz4")
+    val parCollection = sc.makeRDD(1 to 4)
+    val flatMappedRDD = parCollection.flatMap(x => 1 to x)
+    checkpoint(flatMappedRDD, reliableCheckpoint)
+    assert(flatMappedRDD.dependencies.head.rdd === parCollection)
+    val result = flatMappedRDD.collect()
+    assert(flatMappedRDD.dependencies.head.rdd != parCollection)
+    assert(flatMappedRDD.collect() === result)
   }
 
   runTest("checkpointing partitioners", skipLocalCheckpoint = true) { _: Boolean =>
@@ -386,15 +314,13 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
       }
     }
 
-    startSparkContext()
     testPartitionerCheckpointing(partitioner)
 
     // Test that corrupted partitioner file does not prevent recovery of RDD
    testPartitionerCheckpointing(partitioner, corruptPartitionerFile = true)
   }
 
   runTest("RDDs with one-to-one dependencies") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     testRDD(_.map(x => x.toString), reliableCheckpoint)
     testRDD(_.flatMap(x => 1 to x), reliableCheckpoint)
     testRDD(_.filter(_ % 2 == 0), reliableCheckpoint)
@@ -408,7 +334,6 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("ParallelCollectionRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     val parCollection = sc.makeRDD(1 to 4, 2)
     val numPartitions = parCollection.partitions.size
     checkpoint(parCollection, reliableCheckpoint)
@@ -425,7 +350,6 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("BlockRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     val blockId = TestBlockId("id")
     val blockManager = SparkEnv.get.blockManager
     blockManager.putSingle(blockId, "test", StorageLevel.MEMORY_ONLY)
@@ -443,22 +367,19 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("ShuffleRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     testRDD(rdd => {
       // Creating ShuffledRDD directly as PairRDDFunctions.combineByKey produces a MapPartitionedRDD
       new ShuffledRDD[Int, Int, Int](rdd.map(x => (x % 2, 1)), partitioner)
     }, reliableCheckpoint)
   }
 
   runTest("UnionRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     def otherRDD: RDD[Int] = sc.makeRDD(1 to 10, 1)
     testRDD(_.union(otherRDD), reliableCheckpoint)
     testRDDPartitions(_.union(otherRDD), reliableCheckpoint)
   }
 
   runTest("CartesianRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     def otherRDD: RDD[Int] = sc.makeRDD(1 to 10, 1)
     testRDD(new CartesianRDD(sc, _, otherRDD), reliableCheckpoint)
     testRDDPartitions(new CartesianRDD(sc, _, otherRDD), reliableCheckpoint)
@@ -482,7 +403,6 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("CoalescedRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     testRDD(_.coalesce(2), reliableCheckpoint)
     testRDDPartitions(_.coalesce(2), reliableCheckpoint)
 
@@ -505,7 +425,6 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("CoGroupedRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     val longLineageRDD1 = generateFatPairRDD()
 
     // Collect the RDD as sequences instead of arrays to enable equality tests in testRDD
@@ -524,7 +443,6 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("ZippedPartitionsRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     testRDD(rdd => rdd.zip(rdd.map(x => x)), reliableCheckpoint)
     testRDDPartitions(rdd => rdd.zip(rdd.map(x => x)), reliableCheckpoint)
 
@@ -550,7 +468,6 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("PartitionerAwareUnionRDD") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     testRDD(rdd => {
       new PartitionerAwareUnionRDD[(Int, Int)](sc, Array(
         generateFatPairRDD(),
@@ -585,7 +502,6 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("CheckpointRDD with zero partitions") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     val rdd = new BlockRDD[Int](sc, Array.empty[BlockId])
     assert(rdd.partitions.size === 0)
     assert(rdd.isCheckpointed === false)
@@ -600,7 +516,6 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS
   }
 
   runTest("checkpointAllMarkedAncestors") { reliableCheckpoint: Boolean =>
-    startSparkContext()
     testCheckpointAllMarkedAncestors(reliableCheckpoint, checkpointAllMarkedAncestors = true)
     testCheckpointAllMarkedAncestors(reliableCheckpoint, checkpointAllMarkedAncestors = false)
   }
@@ -667,3 +582,42 @@ object CheckpointSuite {
     ).asInstanceOf[RDD[(K, Array[Iterable[V]])]]
   }
 }
+
+class CheckpointCompressionSuite extends SparkFunSuite with LocalSparkContext {
+
+  test("checkpoint compression") {
+    val checkpointDir = Utils.createTempDir()
+    try {
+      val conf = new SparkConf()
+        .set("spark.checkpoint.compress", "true")
+        .set("spark.ui.enabled", "false")
+      sc = new SparkContext("local", "test", conf)
+      sc.setCheckpointDir(checkpointDir.toString)
+      val rdd = sc.makeRDD(1 to 20, numSlices = 1)
+      rdd.checkpoint()
+      assert(rdd.collect().toSeq === (1 to 20))
+
+      // Verify that RDD is checkpointed
+      assert(rdd.firstParent.isInstanceOf[ReliableCheckpointRDD[_]])
+
+      val checkpointPath = new Path(rdd.getCheckpointFile.get)
+      val fs = checkpointPath.getFileSystem(sc.hadoopConfiguration)
+      val checkpointFile =
+        fs.listStatus(checkpointPath).map(_.getPath).find(_.getName.startsWith("part-")).get
+
+      // Verify the checkpoint file is compressed, in other words, can be decompressed
+      val compressedInputStream = CompressionCodec.createCodec(conf)
+        .compressedInputStream(fs.open(checkpointFile))
+      try {
+        ByteStreams.toByteArray(compressedInputStream)
+      } finally {
+        compressedInputStream.close()
+      }
+
+      // Verify that the compressed content can be read back
+      assert(rdd.collect().toSeq === (1 to 20))
+    } finally {
+      Utils.deleteRecursively(checkpointDir)
+    }
  }
+}
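To exercise only the new suite, something along these lines should work with Spark's standard build scripts; the exact sbt task name (test-only vs testOnly) depends on the sbt version in the tree, so treat both invocations as assumptions rather than part of the patch:

# Maven (ScalaTest plugin); runs only the new suite
build/mvn test -DwildcardSuites=org.apache.spark.CheckpointCompressionSuite -Dtest=none

# sbt; older trees use "test-only", newer ones "testOnly"
build/sbt "core/test-only *CheckpointCompressionSuite"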
