Commit 201c301

Author: Erik Selin

Make partitionBy use a tweaked version of hash as its default partition function,
since the Python hash function does not consistently assign the same value to None across Python processes.

Parent: d666053

File tree

1 file changed: +4 −1 lines changed


python/pyspark/rdd.py

Lines changed: 4 additions & 1 deletion
@@ -913,7 +913,7 @@ def rightOuterJoin(self, other, numPartitions=None):
         return python_right_outer_join(self, other, numPartitions)
 
     # TODO: add option to control map-side combining
-    def partitionBy(self, numPartitions, partitionFunc=hash):
+    def partitionBy(self, numPartitions, partitionFunc=None):
         """
         Return a copy of the RDD partitioned using the specified partitioner.
@@ -924,6 +924,9 @@ def partitionBy(self, numPartitions, partitionFunc=hash):
         """
         if numPartitions is None:
             numPartitions = self.ctx.defaultParallelism
+
+        if partitionFunc is None:
+            partitionFunc = lambda x: 0 if x is None else hash(x)
         # Transferring O(n) objects to Java is too expensive. Instead, we'll
         # form the hash buckets in Python, transferring O(numPartitions) objects
         # to Java. Each object is a (splitNumber, [objects]) pair.
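A minimal standalone sketch of the behavior this commit relies on: the new default partition function pins None to a fixed hash of 0, so a None key lands in the same partition no matter which Python worker process computes it. The `assign_partition` helper and the bucket count are hypothetical, added here only to illustrate how `partitionBy` maps a key to a bucket (hash, then modulo).

```python
# Tweaked default partition function from the commit: map None to a fixed
# value instead of calling hash(None), whose result may differ between
# separate Python worker processes.
partition_func = lambda x: 0 if x is None else hash(x)

def assign_partition(key, num_partitions):
    # Hypothetical helper: bucket a key the way partitionBy does,
    # by hashing and taking the result modulo the partition count.
    return partition_func(key) % num_partitions

# None deterministically lands in partition 0, in every process.
print(assign_partition(None, 8))  # -> 0
```

Without the tweak, two workers hashing the same None key could disagree on its partition, scattering records that should be co-located (e.g. during a join) across different buckets.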
