Skip to content

Commit 4cba98f

Browse files
committed
add more tests
1 parent a85a8c6 commit 4cba98f

File tree

2 files changed

+11
-1
lines changed

2 files changed

+11
-1
lines changed

python/pyspark/rdd.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2018,6 +2018,9 @@ def countApproxDistinct(self, relativeSD=0.05):
20182018
>>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct()
20192019
>>> 950 < n < 1050
20202020
True
2021+
>>> n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()
2022+
>>> 18 < n < 22
2023+
True
20212024
"""
20222025
if relativeSD < 0.000017:
20232026
raise ValueError("relativeSD should be greater than 0.000017")

python/pyspark/tests.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,14 @@ def test_count_approx_distinct(self):
409409
self.assertTrue(950 < rdd.countApproxDistinct(0.04) < 1050)
410410
self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.04) < 1050)
411411
self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.04) < 1050)
412-
self.assertTrue(950 < rdd.map(lambda x: (x, -x)).countApproxDistinct(0.04) < 1050)
412+
self.assertTrue(950 < rdd.map(lambda x: set([x, -x])).countApproxDistinct(0.04) < 1050)
413+
414+
rdd = self.sc.parallelize([i % 20 for i in range(1000)], 7)
415+
self.assertTrue(18 < rdd.countApproxDistinct() < 22)
416+
self.assertTrue(18 < rdd.map(float).countApproxDistinct() < 22)
417+
self.assertTrue(18 < rdd.map(str).countApproxDistinct() < 22)
418+
self.assertTrue(18 < rdd.map(lambda x: set([x, -x])).countApproxDistinct() < 22)
419+
413420
self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001))
414421
self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.5))
415422

0 commit comments

Comments
 (0)