Commit 4c25a54

Add s at the end and a couple other fixes
1 parent 9b0ba99 commit 4c25a54

5 files changed: +11 −50 lines


core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala

Lines changed: 4 additions & 4 deletions
@@ -765,9 +765,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
   * This is more efficient than calling `repartition` and then sorting within each partition
   * because it can push the sorting down into the shuffle machinery.
   */
-  def repartitionAndSortWithinPartition(partitioner: Partitioner): JavaPairRDD[K, V] = {
+  def repartitionAndSortWithinPartitions(partitioner: Partitioner): JavaPairRDD[K, V] = {
     val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]]
-    repartitionAndSortWithinPartition(partitioner, comp)
+    repartitionAndSortWithinPartitions(partitioner, comp)
   }

   /**
@@ -777,11 +777,11 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
   * This is more efficient than calling `repartition` and then sorting within each partition
   * because it can push the sorting down into the shuffle machinery.
   */
-  def repartitionAndSortWithinPartition(partitioner: Partitioner, comp: Comparator[K])
+  def repartitionAndSortWithinPartitions(partitioner: Partitioner, comp: Comparator[K])
     : JavaPairRDD[K, V] = {
     implicit val ordering = comp  // Allow implicit conversion of Comparator to Ordering.
     fromRDD(
-      new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartition(partitioner))
+      new OrderedRDDFunctions[K, V, (K, V)](rdd).repartitionAndSortWithinPartitions(partitioner))
   }

   /**

core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala

Lines changed: 1 addition & 2 deletions
@@ -72,8 +72,7 @@ class OrderedRDDFunctions[K : Ordering : ClassTag,
   * This is more efficient than calling `repartition` and then sorting within each partition
   * because it can push the sorting down into the shuffle machinery.
   */
-  def repartitionAndSortWithinPartition(partitioner: Partitioner)
-    : RDD[(K, V)] = {
+  def repartitionAndSortWithinPartitions(partitioner: Partitioner): RDD[(K, V)] = {
     new ShuffledRDD[K, V, V](self, partitioner).setKeyOrdering(ordering)
   }

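For reference, a minimal sketch of the renamed Scala API in use. This is not part of the commit; it assumes a spark-shell style session where `sc` is an existing SparkContext, and the sample data mirrors the RDDSuite test below:

import org.apache.spark.HashPartitioner
import org.apache.spark.SparkContext._  // brings in the pair-RDD / ordered-RDD implicits

// Same sample data as the RDDSuite test, in two initial partitions.
val data = sc.parallelize(Seq((0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)), 2)

// A single shuffle both repartitions by key hash and sorts each partition by key,
// instead of repartition(...) followed by a separate per-partition sort.
val repartitioned = data.repartitionAndSortWithinPartitions(new HashPartitioner(2))

repartitioned.glom().collect()
// Array(Array((0,5), (0,8), (2,6)), Array((1,3), (3,8), (3,8)))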
core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -690,7 +690,7 @@ class RDDSuite extends FunSuite with SharedSparkContext {
       def getPartition(key: Any): Int = key.asInstanceOf[Int] % 2
     }

-    val repartitioned = data.repartitionAndSortWithinPartition(partitioner)
+    val repartitioned = data.repartitionAndSortWithinPartitions(partitioner)
     val partitions = repartitioned.glom().collect()
     assert(partitions(0) === Seq((0, 5), (0, 8), (2, 6)))
     assert(partitions(1) === Seq((1, 3), (3, 8), (3, 8)))

python/pyspark/rdd.py

Lines changed: 3 additions & 3 deletions
@@ -520,14 +520,14 @@ def __add__(self, other):
             raise TypeError
         return self.union(other)

-    def repartitionAndSortWithinPartition(self, ascending=True, numPartitions=None,
-                                          partitionFunc=portable_hash, keyfunc=lambda x: x):
+    def repartitionAndSortWithinPartitions(self, numPartitions=None, partitionFunc=portable_hash,
+                                           ascending=True, keyfunc=lambda x: x):
         """
         Repartition the RDD according to the given partitioner and, within each resulting partition,
         sort records by their keys.

         >>> rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)])
-        >>> rdd2 = rdd.repartitionAndSortWithinPartition(True, lambda x: x % 2, 2)
+        >>> rdd2 = rdd.repartitionAndSortWithinPartitions(True, lambda x: x % 2, 2)
         >>> rdd2.glom().collect()
         [[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]]
         """

python/pyspark/tests.py

Lines changed: 2 additions & 40 deletions
@@ -43,7 +43,6 @@
 from pyspark.files import SparkFiles
 from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, PickleSerializer
 from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, ExternalSorter
-from pyspark.sql import SQLContext, IntegerType

 _have_scipy = False
 _have_numpy = False
@@ -425,22 +424,6 @@ def test_zip_with_different_number_of_items(self):
         self.assertEquals(a.count(), b.count())
         self.assertRaises(Exception, lambda: a.zip(b).count())

-    def test_count_approx_distinct(self):
-        rdd = self.sc.parallelize(range(1000))
-        self.assertTrue(950 < rdd.countApproxDistinct(0.04) < 1050)
-        self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.04) < 1050)
-        self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.04) < 1050)
-        self.assertTrue(950 < rdd.map(lambda x: (x, -x)).countApproxDistinct(0.04) < 1050)
-
-        rdd = self.sc.parallelize([i % 20 for i in range(1000)], 7)
-        self.assertTrue(18 < rdd.countApproxDistinct() < 22)
-        self.assertTrue(18 < rdd.map(float).countApproxDistinct() < 22)
-        self.assertTrue(18 < rdd.map(str).countApproxDistinct() < 22)
-        self.assertTrue(18 < rdd.map(lambda x: (x, -x)).countApproxDistinct() < 22)
-
-        self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001))
-        self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.5))
-
     def test_histogram(self):
         # empty
         rdd = self.sc.parallelize([])
@@ -545,36 +528,15 @@ def test_histogram(self):
         self.assertEquals(([1, "b"], [5]), rdd.histogram(1))
         self.assertRaises(TypeError, lambda: rdd.histogram(2))

-    def test_repartitionAndSortWithinPartition(self):
+    def test_repartitionAndSortWithinPartitions(self):
         rdd = self.sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)], 2)

-        repartitioned = rdd.repartitionAndSortWithinPartition(True, 2, lambda key: key % 2)
+        repartitioned = rdd.repartitionAndSortWithinPartitions(2, lambda key: key % 2)
         partitions = repartitioned.glom().collect()
         self.assertEquals(partitions[0], [(0, 5), (0, 8), (2, 6)])
         self.assertEquals(partitions[1], [(1, 3), (3, 8), (3, 8)])


-class TestSQL(PySparkTestCase):
-
-    def setUp(self):
-        PySparkTestCase.setUp(self)
-        self.sqlCtx = SQLContext(self.sc)
-
-    def test_udf(self):
-        self.sqlCtx.registerFunction("twoArgs", lambda x, y: len(x) + y, IntegerType())
-        [row] = self.sqlCtx.sql("SELECT twoArgs('test', 1)").collect()
-        self.assertEqual(row[0], 5)
-
-    def test_broadcast_in_udf(self):
-        bar = {"a": "aa", "b": "bb", "c": "abc"}
-        foo = self.sc.broadcast(bar)
-        self.sqlCtx.registerFunction("MYUDF", lambda x: foo.value[x] if x else '')
-        [res] = self.sqlCtx.sql("SELECT MYUDF('c')").collect()
-        self.assertEqual("abc", res[0])
-        [res] = self.sqlCtx.sql("SELECT MYUDF('')").collect()
-        self.assertEqual("", res[0])
-
-
 class TestIO(PySparkTestCase):

     def test_stdout_redirection(self):
