Skip to content

Commit 37a7dea

Browse files
committed
cleaned up boundaries for histogram -- uses real min/max when buckets are derived
1 parent 29981f2 commit 37a7dea

File tree

1 file changed

+10
-5
lines changed

1 file changed

+10
-5
lines changed

python/pyspark/rdd.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ def getBuckets(self, bucketCount):
623623
if increment != 0:
624624
buckets = range(min,max, increment)
625625

626-
return buckets
626+
return {"min":min, "max":max, "buckets":buckets}
627627

628628
def histogram(self, bucketCount, buckets=None):
629629
"""
@@ -633,10 +633,15 @@ def histogram(self, bucketCount, buckets=None):
633633
>>> sc.parallelize([1,49, 23, 100, 12, 13, 20, 22, 75, 50]).histogram(3)
634634
defaultdict(<type 'int'>, {(67, inf): 2, (1, 33): 6, (34, 66): 2})
635635
"""
636-
636+
min = float("-inf")
637+
max = float("inf")
637638
evenBuckets = False
638639
if not buckets:
639-
buckets = self.getBuckets(bucketCount)
640+
b = self.getBuckets(bucketCount)
641+
buckets = b["buckets"]
642+
min = b["min"]
643+
max = b["max"]
644+
640645
if len(buckets) < 2:
641646
raise ValueError("requires more than 1 bucket")
642647
if len(buckets) % 2 == 0:
@@ -649,9 +654,9 @@ def histogramPartition(iterator):
649654
if k < len(buckets) and k > 0:
650655
key = (buckets[k-1], buckets[k]-1)
651656
elif k == len(buckets):
652-
key = (buckets[k-1], float("inf"))
657+
key = (buckets[k-1], max)
653658
elif k == 0:
654-
key = (float("-inf"), buckets[k]-1)
659+
key = (min, buckets[k]-1)
655660
counters[key] += 1
656661
yield counters
657662

0 commit comments

Comments
 (0)