1 file changed: +3 -9 lines changed
Original file line number | Diff line number | Diff line change
@@ -2023,15 +2023,9 @@ def countApproxDistinct(self, relativeSD=0.05):
20232023 raise ValueError ("relativeSD should be greater than 0.000017" )
20242024 if relativeSD > 0.37 :
20252025 raise ValueError ("relativeSD should be smaller than 0.37" )
2026- hashRDD = self .map (lambda x : portable_hash (x ) % sys .maxint )
2027- c = hashRDD ._to_java_object_rdd ().countApproxDistinct (relativeSD )
2028- # range of hash is [0, sys.maxint]
2029- if c > sys .maxint / 30 :
2030- # correction for hash collision in Python,
2031- # hash collision probability is 1 - exp(-X), so X = - log(1 - p)
2032- # see http://preshing.com/20110504/hash-collision-probabilities/
2033- c = - sys .maxint * log (1 - float (c ) / sys .maxint )
2034- return int (c )
2026+ # the hash space in Java is 2^32
2027+ hashRDD = self .map (lambda x : portable_hash (x ) & 0xFFFFFFFF )
2028+ return hashRDD ._to_java_object_rdd ().countApproxDistinct (relativeSD )
20352029
20362030
20372031class PipelinedRDD (RDD ):
You can’t perform that action at this time.
0 commit comments