Skip to content

Commit e97e342

Browse files
committed
add countApproxDistinct()
1 parent b21ae5b commit e97e342

File tree

3 files changed

+40
-4
lines changed

3 files changed

+40
-4
lines changed

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -993,7 +993,7 @@ abstract class RDD[T: ClassTag](
993993
*/
994994
@Experimental
995995
def countApproxDistinct(p: Int, sp: Int): Long = {
996-
require(p >= 4, s"p ($p) must be greater than 0")
996+
require(p >= 4, s"p ($p) must be at least 4")
997997
require(sp <= 32, s"sp ($sp) cannot be greater than 32")
998998
require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)")
999999
val zeroCounter = new HyperLogLogPlus(p, sp)

python/pyspark/rdd.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1864,7 +1864,7 @@ def _is_pickled(self):
18641864
return True
18651865
return False
18661866

1867-
def _to_jrdd(self):
1867+
def _to_java_object_rdd(self):
18681868
""" Return an JavaRDD of Object by unpickling
18691869
18701870
It will convert each Python object into Java object by Pyrolite, whenever the
@@ -1899,7 +1899,7 @@ def sumApprox(self, timeout, confidence=0.95):
18991899
>>> (rdd.sumApprox(1000) - r) / r < 0.05
19001900
True
19011901
"""
1902-
jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_jrdd()
1902+
jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd()
19031903
jdrdd = self.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd())
19041904
r = jdrdd.sumApprox(timeout, confidence).getFinalValue()
19051905
return BoundedFloat(r.mean(), r.confidence(), r.low(), r.high())
@@ -1915,11 +1915,38 @@ def meanApprox(self, timeout, confidence=0.95):
19151915
>>> (rdd.meanApprox(1000) - r) / r < 0.05
19161916
True
19171917
"""
1918-
jrdd = self.map(float)._to_jrdd()
1918+
jrdd = self.map(float)._to_java_object_rdd()
19191919
jdrdd = self.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd())
19201920
r = jdrdd.meanApprox(timeout, confidence).getFinalValue()
19211921
return BoundedFloat(r.mean(), r.confidence(), r.low(), r.high())
19221922

1923+
def countApproxDistinct(self, relativeSD=0.05):
    """
    :: Experimental ::
    Return approximate number of distinct elements in the RDD.

    The algorithm used is based on streamlib's implementation of
    "HyperLogLog in Practice: Algorithmic Engineering of a State
    of The Art Cardinality Estimation Algorithm", available
    <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.

    This supports every object type that Pyrolite can serialize,
    which covers nearly all builtin types.

    @param relativeSD Relative accuracy. Smaller values create
           counters that require more space.
           It must be greater than 0.000017.

    >>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct()
    >>> 950 < n < 1050
    True
    """
    # Validate the accuracy bound before touching the JVM; the limits
    # mirror the constraints of the underlying HyperLogLog++ counter.
    if relativeSD < 0.000017:
        raise ValueError("relativeSD should be greater than 0.000017")
    elif relativeSD > 0.37:
        raise ValueError("relativeSD should be smaller than 0.37")
    # Convert to an RDD of Java objects, then delegate the estimation
    # to the Scala implementation.
    java_rdd = self._to_java_object_rdd()
    return java_rdd.countApproxDistinct(relativeSD)
1949+
19231950

19241951
class PipelinedRDD(RDD):
19251952

python/pyspark/tests.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,15 @@ def test_zip_with_different_number_of_items(self):
364364
self.assertEquals(a.count(), b.count())
365365
self.assertRaises(Exception, lambda: a.zip(b).count())
366366

367+
def test_count_approx_distinct(self):
    """The estimate should land within ~5% of the true cardinality for
    several element types, and out-of-range relativeSD must raise."""
    rdd = self.sc.parallelize(range(1000))
    # Exercise int, float, str and tuple elements through the same check.
    candidates = [
        rdd,
        rdd.map(float),
        rdd.map(str),
        rdd.map(lambda x: (x, -x)),
    ]
    for candidate in candidates:
        estimate = candidate.countApproxDistinct(0.04)
        self.assertTrue(950 < estimate < 1050)
    # relativeSD outside (0.000017, 0.37) is rejected up front.
    self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001))
    self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.5))
375+
367376

368377
class TestIO(PySparkTestCase):
369378

0 commit comments

Comments
 (0)