From 6d67dfc1d4c012492b97873beaac5a7cbfd6f55a Mon Sep 17 00:00:00 2001
From: Ala Luszczak
Date: Fri, 23 Feb 2018 15:37:19 +0100
Subject: [PATCH 1/2] Fix SPARK-23496.

---
 .../org/apache/spark/rdd/CoalescedRDD.scala   |  6 +--
 .../scala/org/apache/spark/rdd/RDDSuite.scala | 43 +++++++++++++++++++
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
index 10451a324b0f..5a24afb36cb8 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
@@ -266,17 +266,15 @@ private class DefaultPartitionCoalescer(val balanceSlack: Double = 0.10)
         numCreated += 1
       }
     }
-    tries = 0
     // if we don't have enough partition groups, create duplicates
     while (numCreated < targetLen) {
-      val (nxt_replica, nxt_part) = partitionLocs.partsWithLocs(tries)
-      tries += 1
+      val (nxt_replica, nxt_part) = partitionLocs.partsWithLocs(
+        rnd.nextInt(partitionLocs.partsWithLocs.length))
       val pgroup = new PartitionGroup(Some(nxt_replica))
       groupArr += pgroup
       groupHash.getOrElseUpdate(nxt_replica, ArrayBuffer()) += pgroup
       addPartToPGroup(nxt_part, pgroup)
       numCreated += 1
-      if (tries >= partitionLocs.partsWithLocs.length) tries = 0
     }
   }
 
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index e994d724c462..717a404742a7 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -1129,6 +1129,36 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
     }.collect()
   }
 
+  test("SPARK-23496: order of input partitions can result in severe skew in coalesce") {
+    val numInputPartitions = 100
+    val numCoalescedPartitions = 50
+    val locations = Array("locA", "locB")
+
+    val inputRDD = sc.makeRDD(Range(0, numInputPartitions).toArray[Int], numInputPartitions)
+    assert(inputRDD.getNumPartitions == numInputPartitions)
+
+    val locationPrefRDD = new LocationPrefRDD(inputRDD, { (p: Partition) =>
+      if (p.index < numCoalescedPartitions) {
+        Seq(locations(0))
+      } else {
+        Seq(locations(1))
+      }
+    })
+    val coalescedRDD = new CoalescedRDD(locationPrefRDD, numCoalescedPartitions)
+
+    val numPartsPerLocation = coalescedRDD
+      .getPartitions
+      .map(coalescedRDD.getPreferredLocations(_).head)
+      .groupBy(identity)
+      .mapValues(_.size)
+
+    // Without the fix these would be:
+    // numPartsPerLocation(locations(0)) == numCoalescedPartitions - 1
+    // numPartsPerLocation(locations(1)) == 1
+    assert(numPartsPerLocation(locations(0)) > 0.4 * numCoalescedPartitions)
+    assert(numPartsPerLocation(locations(1)) > 0.4 * numCoalescedPartitions)
+  }
+
   // NOTE
   // Below tests calling sc.stop() have to be the last tests in this suite. If there are tests
   // running after them and if they access sc those tests will fail as sc is already closed, because
@@ -1210,3 +1240,16 @@ class SizeBasedCoalescer(val maxSize: Int) extends PartitionCoalescer with Seria
     groups.toArray
   }
 }
+
+/** Alters the preferred locations of the parent RDD using the provided function. */
+class LocationPrefRDD[T: ClassTag](
+    @transient var prev: RDD[T],
+    val locationPicker: Partition => Seq[String]) extends RDD[T](prev) {
+  override protected def getPartitions: Array[Partition] = prev.partitions
+
+  override def compute(partition: Partition, context: TaskContext): Iterator[T] =
+    null.asInstanceOf[Iterator[T]]
+
+  override def getPreferredLocations(partition: Partition): Seq[String] =
+    locationPicker(partition)
+}

From 051273651cd65b9eca568b37c79b50342a7f69c2 Mon Sep 17 00:00:00 2001
From: Ala Luszczak
Date: Mon, 26 Feb 2018 15:41:24 +0100
Subject: [PATCH 2/2] Apply review remarks.

---
 core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala | 2 ++
 core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala     | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
index 5a24afb36cb8..94e7d0b38cba 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
@@ -268,6 +268,8 @@ private class DefaultPartitionCoalescer(val balanceSlack: Double = 0.10)
     }
     // if we don't have enough partition groups, create duplicates
     while (numCreated < targetLen) {
+      // Copy the preferred location from a random input partition.
+      // This helps in avoiding skew when the input partitions are clustered by preferred location.
       val (nxt_replica, nxt_part) = partitionLocs.partsWithLocs(
         rnd.nextInt(partitionLocs.partsWithLocs.length))
       val pgroup = new PartitionGroup(Some(nxt_replica))
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index 717a404742a7..191c61250ce2 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -1152,9 +1152,8 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
       .groupBy(identity)
       .mapValues(_.size)
 
-    // Without the fix these would be:
-    // numPartsPerLocation(locations(0)) == numCoalescedPartitions - 1
-    // numPartsPerLocation(locations(1)) == 1
+    // Make sure the coalesced partitions are distributed fairly evenly between the two locations.
+    // This should not become flaky since the DefaultPartitionCoalescer uses a fixed seed.
     assert(numPartsPerLocation(locations(0)) > 0.4 * numCoalescedPartitions)
     assert(numPartsPerLocation(locations(1)) > 0.4 * numCoalescedPartitions)
   }
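
A note on why the random draw removes the skew: the standalone sketch below replays just the duplicate-group creation loop that patch 1 changes. It is illustrative only (CoalesceSkewSketch, partsWithLocs, and targetLen here are stand-in names, not Spark API), assuming 100 input partitions clustered by preferred location exactly as in the regression test above.

    import scala.util.Random

    object CoalesceSkewSketch {
      def main(args: Array[String]): Unit = {
        // 100 input partitions: the first 50 prefer "locA", the rest "locB",
        // mirroring the LocationPrefRDD setup in the regression test.
        val partsWithLocs = (0 until 100).map(i => (if (i < 50) "locA" else "locB", i))
        val targetLen = 50 // number of duplicate groups still needed
        val rnd = new Random(7919) // a fixed seed keeps the sketch deterministic,
                                   // as the patch-2 comment notes the coalescer itself is

        // Old behavior: walk partsWithLocs from index 0, wrapping around, so every
        // duplicate copies its location from the front of the (clustered) array.
        val roundRobin = (0 until targetLen).map(i => partsWithLocs(i % partsWithLocs.length)._1)

        // New behavior: copy the location of a randomly chosen input partition.
        val randomPick =
          (0 until targetLen).map(_ => partsWithLocs(rnd.nextInt(partsWithLocs.length))._1)

        def counts(locs: Seq[String]): Map[String, Int] =
          locs.groupBy(identity).map { case (loc, group) => loc -> group.size }

        println(s"round-robin: ${counts(roundRobin)}") // Map(locA -> 50): fully skewed
        println(s"random:      ${counts(randomPick)}") // roughly Map(locA -> 25, locB -> 25)
      }
    }

Running the sketch prints all 50 duplicated groups on locA for the round-robin walk (the real coalescer first creates one group per distinct location, which is where the 49/1 split quoted in the original test comment comes from), while the random draw lands near 25/25, comfortably above the 0.4 * numCoalescedPartitions bound the assertions check.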