We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent d306492 · commit 9d2565f · Copy full SHA for 9d2565f
python/pyspark/rdd.py
@@ -2027,8 +2027,10 @@ def countApproxDistinct(self, relativeSD=0.05):
2027
c = hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)
2028
# range of hash is [0, sys.maxint]
2029
if c > sys.maxint / 30:
2030
- # correction for hash collision in Python
2031
- c = -sys.maxint * log(1 - float(c) / sys.maxint)
+ # correction for hash collision in Python,
+ # hash collision probability is 1 - exp(-X), so X = - log(1 - p)
2032
+ # see http://preshing.com/20110504/hash-collision-probabilities/
2033
+ c = - sys.maxint * log(1 - float(c) / sys.maxint)
2034
return int(c)
2035
2036
0 commit comments