@@ -1864,7 +1864,7 @@ def _is_pickled(self):
18641864 return True
18651865 return False
18661866
1867- def _to_jrdd (self ):
1867+ def _to_java_object_rdd (self ):
18681868 """ Return an JavaRDD of Object by unpickling
18691869
18701870 It will convert each Python object into Java object by Pyrolite, whenever the
@@ -1899,7 +1899,7 @@ def sumApprox(self, timeout, confidence=0.95):
18991899 >>> (rdd.sumApprox(1000) - r) / r < 0.05
19001900 True
19011901 """
1902- jrdd = self .mapPartitions (lambda it : [float (sum (it ))])._to_jrdd ()
1902+ jrdd = self .mapPartitions (lambda it : [float (sum (it ))])._to_java_object_rdd ()
19031903 jdrdd = self .ctx ._jvm .JavaDoubleRDD .fromRDD (jrdd .rdd ())
19041904 r = jdrdd .sumApprox (timeout , confidence ).getFinalValue ()
19051905 return BoundedFloat (r .mean (), r .confidence (), r .low (), r .high ())
@@ -1915,11 +1915,38 @@ def meanApprox(self, timeout, confidence=0.95):
19151915 >>> (rdd.meanApprox(1000) - r) / r < 0.05
19161916 True
19171917 """
1918- jrdd = self .map (float )._to_jrdd ()
1918+ jrdd = self .map (float )._to_java_object_rdd ()
19191919 jdrdd = self .ctx ._jvm .JavaDoubleRDD .fromRDD (jrdd .rdd ())
19201920 r = jdrdd .meanApprox (timeout , confidence ).getFinalValue ()
19211921 return BoundedFloat (r .mean (), r .confidence (), r .low (), r .high ())
19221922
def countApproxDistinct(self, relativeSD=0.05):
    """
    :: Experimental ::
    Return the approximate number of distinct elements in the RDD.

    The algorithm used is based on streamlib's implementation of
    "HyperLogLog in Practice: Algorithmic Engineering of a State
    of The Art Cardinality Estimation Algorithm", available at
    http://dx.doi.org/10.1145/2452376.2452456.

    This supports all the types of objects that are supported by
    Pyrolite, i.e. nearly all builtin types.

    @param relativeSD Relative accuracy. Smaller values create
                       counters that require more space.
                       It must be greater than 0.000017 and
                       must not exceed 0.37.
    @return the value returned by countApproxDistinct on the
            Java object RDD built from this RDD.
    @raise ValueError if relativeSD is not greater than 0.000017
                      (this includes NaN) or is greater than 0.37.

    >>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct()
    >>> 950 < n < 1050
    True
    """
    # Phrased as "not (x > bound)" rather than "x < bound" so that the
    # bound itself and NaN (for which every comparison is False) are both
    # rejected here, matching the error message and the documented
    # contract, instead of being passed on to the JVM.
    if not relativeSD > 0.000017:
        raise ValueError("relativeSD should be greater than 0.000017")
    if relativeSD > 0.37:
        raise ValueError("relativeSD should be smaller than 0.37")
    return self._to_java_object_rdd().countApproxDistinct(relativeSD)
1949+
19231950
19241951class PipelinedRDD (RDD ):
19251952
0 commit comments