Skip to content

Commit e20da47

Browse files
committed
remove the correction in Python
1 parent c38c4e4 commit e20da47

File tree

1 file changed

+3
-9
lines changed

1 file changed

+3
-9
lines changed

python/pyspark/rdd.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2023,15 +2023,9 @@ def countApproxDistinct(self, relativeSD=0.05):
20232023
raise ValueError("relativeSD should be greater than 0.000017")
20242024
if relativeSD > 0.37:
20252025
raise ValueError("relativeSD should be smaller than 0.37")
2026-
hashRDD = self.map(lambda x: portable_hash(x) % sys.maxint)
2027-
c = hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)
2028-
# range of hash is [0, sys.maxint]
2029-
if c > sys.maxint / 30:
2030-
# correction for hash collision in Python,
2031-
# hash collision probability is 1 - exp(-X), so X = - log(1 - p)
2032-
# see http://preshing.com/20110504/hash-collision-probabilities/
2033-
c = - sys.maxint * log(1 - float(c) / sys.maxint)
2034-
return int(c)
2026+
# the hash space in Java is 2^32, so mask the Python hash down to its low 32 bits
2027+
hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF)
2028+
return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)
20352029

20362030

20372031
class PipelinedRDD(RDD):

0 commit comments

Comments
 (0)