Commit d4aed26

Davies Liu authored and JoshRosen committed
[SPARK-4304] [PySpark] Fix sort on empty RDD (1.0 branch)
This PR fixes sortBy()/sortByKey() on an empty RDD. It should be backported into 1.0.

Author: Davies Liu <[email protected]>

Closes #3163 from davies/fix_sort_1.0 and squashes the following commits:

9be984f [Davies Liu] fix sort on empty RDD
1 parent 18c8c38 commit d4aed26
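
For context, a minimal reproduction sketch of the behavior this commit addresses. It is not part of the commit; it assumes a local SparkContext and an illustrative app name, and the expected output reflects the fixed behavior described in the commit message:

from pyspark import SparkContext

# Minimal sketch, not from the commit: sorting an empty pair RDD.
# Before this fix, sortByKey() could fail while sampling the (empty) RDD to
# choose range-partition boundaries; with the fix it returns the RDD as-is.
sc = SparkContext("local[2]", "empty-sort-check")        # illustrative master/app name
empty_pairs = sc.parallelize(zip([], []))                # an empty (key, value) RDD
print(empty_pairs.sortByKey(numPartitions=2).collect())  # expected: []
sc.stop()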

2 files changed (+5, -0 lines)

python/pyspark/rdd.py

Lines changed: 2 additions & 0 deletions
@@ -496,6 +496,8 @@ def sortByKey(self, ascending=True, numPartitions=None, keyfunc = lambda x: x):
         # number of (key, value) pairs falling into them
         if numPartitions > 1:
             rddSize = self.count()
+            if not rddSize:
+                return self
             maxSampleSize = numPartitions * 20.0  # constant from Spark's RangePartitioner
             fraction = min(maxSampleSize / max(rddSize, 1), 1.0)
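
Why the early return helps: a simplified, standalone illustration (an assumption based on the diff context above, not the patched PySpark code). With zero elements, the sample drawn to pick range-partition boundaries is empty, so there is nothing to index into when selecting numPartitions - 1 boundary keys:

# Simplified sketch (assumption, not the commit's code): selecting range-partition
# boundaries from an empty sample fails, which is the kind of error the
# "if not rddSize: return self" guard short-circuits before sampling.
numPartitions = 2
samples = []  # what sampling an empty RDD would yield
try:
    bounds = [samples[len(samples) * (i + 1) // numPartitions]
              for i in range(numPartitions - 1)]
except IndexError:
    bounds = None  # empty sample: no boundaries can be chosen
print(bounds)  # prints: None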

python/pyspark/tests.py

Lines changed: 3 additions & 0 deletions
@@ -198,6 +198,9 @@ def test_deleting_input_files(self):
         os.unlink(tempFile.name)
         self.assertRaises(Exception, lambda: filtered_data.count())
 
+    def test_sort_on_empty_rdd(self):
+        self.assertEqual([], self.sc.parallelize(zip([], [])).sortByKey().collect())
+
     def test_itemgetter(self):
         rdd = self.sc.parallelize([range(10)])
         from operator import itemgetter
