Added histogram method to RDD; added max and min to StatCounter

dwmclary · dwmclary · commit 491601600a09 · 2014-03-10T23:58:32.000-07:00
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
@@ -24,6 +24,7 @@
 import sys
 import shlex
 import traceback
+from bisect import bisect_right
 from subprocess import Popen, PIPE
 from tempfile import NamedTemporaryFile
 from threading import Thread
@@ -534,6 +535,7 @@ def func(iterator):
         return reduce(op, vals, zeroValue)
 
     # TODO: aggregate
+        
 
     def sum(self):
         """
@@ -610,6 +612,60 @@ def sampleVariance(self):
         """
         return self.stats().sampleVariance()
 
+    def getBuckets(self, bucketCount):
+        """
+        Compute the lower boundaries of bucketCount evenly spaced integer
+        buckets, starting at the (floored) minimum of the RDD.
+
+        The bucket width is (max - min) / bucketCount rounded down, so
+        values between the last boundary and the maximum all belong to
+        the final bucket. An empty list is returned when the RDD is empty
+        or when its range is too narrow to give a non-zero integer width.
+
+        Raises ValueError if bucketCount is smaller than 1.
+        """
+        if bucketCount < 1:
+            raise ValueError("bucketCount must be at least 1")
+
+        # use the StatCounter as a quick way of getting max and min
+        mm_stats = self.stats()
+        lowest = mm_stats.min()
+        highest = mm_stats.max()
+
+        # a StatCounter starts at min=inf / max=-inf, so min > max means
+        # that no value was ever merged (empty RDD)
+        if lowest > highest:
+            return []
+
+        # floor to whole numbers so that range() also accepts float data
+        start = int(lowest // 1)
+        increment = int((highest - start) // bucketCount)
+        if increment == 0:
+            return []
+
+        # stop before the extra boundary that range(min, max, increment)
+        # yields whenever the width does not divide the range exactly
+        return list(range(start, start + increment * bucketCount, increment))
+
+    def histogram(self, bucketCount, buckets=None):
+        """
+        Count how many elements of this RDD fall into each bucket.
+
+        buckets is a sequence of bucket lower boundaries; when it is
+        omitted or empty, the boundaries are taken from
+        getBuckets(bucketCount). The result is a dictionary that maps
+        (low, high) tuples to counts and only contains buckets that
+        received at least one element. Buckets between two boundaries are
+        keyed (boundary, nextBoundary - 1), elements below the first
+        boundary are keyed (-inf, firstBoundary - 1) and elements at or
+        above the last boundary are keyed (lastBoundary, inf).
+
+        Raises ValueError if fewer than two boundaries are available.
+
+        >>> h = sc.parallelize([1, 49, 23, 100, 75, 50]).histogram(2, [0, 50])
+        >>> sorted(h.items())
+        [((0, 49), 3), ((50, inf), 3)]
+        """
+        if not buckets:
+            buckets = self.getBuckets(bucketCount)
+        if len(buckets) < 2:
+            raise ValueError("requires more than 1 bucket")
+        # bisect_right only gives meaningful answers on ascending boundaries
+        buckets = sorted(buckets)
+
+        # count the elements of a single partition per bucket
+        def histogramPartition(iterator):
+            counters = defaultdict(int)
+            for obj in iterator:
+                k = bisect_right(buckets, obj)
+                if k == 0:
+                    # smaller than every boundary
+                    key = (float("-inf"), buckets[0] - 1)
+                elif k == len(buckets):
+                    # at or beyond the last boundary
+                    key = (buckets[-1], float("inf"))
+                else:
+                    key = (buckets[k - 1], buckets[k] - 1)
+                counters[key] += 1
+            yield counters
+
+        # add the counts of d2 to d1; buckets that only occur in d2 must be
+        # kept as well, otherwise their elements vanish from the result
+        def mergeCounters(d1, d2):
+            for k, count in d2.items():
+                d1[k] = d1.get(k, 0) + count
+            return d1
+
+        return self.mapPartitions(histogramPartition).reduce(mergeCounters)
+
+
     def countByValue(self):
         """
         Return the count of each unique value in this RDD as a dictionary of
diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py
@@ -26,7 +26,9 @@ def __init__(self, values=[]):
         self.n = 0L    # Running count of our values
         self.mu = 0.0  # Running mean of our values
         self.m2 = 0.0  # Running variance numerator (sum of (x - mean)^2)
-
+        self.max_v = float("-inf")
+        self.min_v = float("inf")
+        
         for v in values:
             self.merge(v)
             
@@ -36,6 +38,11 @@ def merge(self, value):
         self.n += 1
         self.mu += delta / self.n
         self.m2 += delta * (value - self.mu)
+        if self.max_v < value:
+            self.max_v = value
+        if self.min_v > value:
+            self.min_v = value
+            
         return self
 
     # Merge another StatCounter into this one, adding up the internal statistics.
@@ -49,7 +56,10 @@ def mergeStats(self, other):
             if self.n == 0:
                 self.mu = other.mu
                 self.m2 = other.m2
-                self.n = other.n       
+                self.n = other.n
+                self.max_v = other.max_v
+                self.min_v = other.min_v
+                
             elif other.n != 0:        
                 delta = other.mu - self.mu
                 if other.n * 10 < self.n:
@@ -58,6 +68,9 @@ def mergeStats(self, other):
                     self.mu = other.mu - (delta * self.n) / (self.n + other.n)
                 else:
                     self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n)
+                # merge the extremes whichever mean formula was used above
+                self.max_v = max(self.max_v, other.max_v)
+                self.min_v = min(self.min_v, other.min_v)
         
                 self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n)
                 self.n += other.n
@@ -76,6 +89,12 @@ def mean(self):
     def sum(self):
         return self.n * self.mu
 
+    # Return the smallest value merged so far (inf when nothing was merged).
+    def min(self):
+        return self.min_v
+
+    # Return the largest value merged so far (-inf when nothing was merged).
+    def max(self):
+        return self.max_v
+    
     # Return the variance of the values.
     def variance(self):
         if self.n == 0: