Commit d357b70

support windowed dstream

1 parent bd13026

3 files changed: 247 additions & 98 deletions

3 files changed

+247
-98
lines changed

python/pyspark/streaming/dstream.py

Lines changed: 58 additions & 31 deletions
@@ -22,8 +22,8 @@
 from pyspark.storagelevel import StorageLevel
 from pyspark.streaming.util import rddToFileName, RDDFunction, RDDFunction2
 from pyspark.rdd import portable_hash
-from pyspark.streaming.duration import Seconds
-
+from pyspark.streaming.duration import Duration, Seconds
+from pyspark.resultiterable import ResultIterable
 
 __all__ = ["DStream"]
 
@@ -299,13 +299,17 @@ def get_output(rdd, time):
         return result
 
     def transform(self, func):
-        return TransformedRDD(self, lambda a, t: func(a), True)
+        return TransformedDStream(self, lambda a, t: func(a), True)
 
     def transformWithTime(self, func):
-        return TransformedRDD(self, func, False)
+        return TransformedDStream(self, func, False)
 
     def transformWith(self, func, other, keepSerializer=False):
-        return Transformed2RDD(self, lambda a, b, t: func(a, b), other, keepSerializer)
+        jfunc = RDDFunction2(self.ctx, func, self._jrdd_deserializer)
+        dstream = self.ctx._jvm.PythonTransformed2DStream(self._jdstream.dstream(),
+                                                          other._jdstream.dstream(), jfunc)
+        jrdd_serializer = self._jrdd_deserializer if keepSerializer else self.ctx.serializer
+        return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer)
 
     def repartitions(self, numPartitions):
         return self.transform(lambda rdd: rdd.repartition(numPartitions))
@@ -336,28 +340,60 @@ def window(self, windowDuration, slideDuration=None):
         s = Seconds(slideDuration)
         return DStream(self._jdstream.window(d, s), self._ssc, self._jrdd_deserializer)
 
-    def reduceByWindow(self, reduceFunc, inReduceFunc, windowDuration, slideDuration):
-        pass
-
-    def countByWindow(self, window, slide):
-        pass
-
-    def countByValueAndWindow(self, window, slide, numPartitions=None):
-        pass
-
-    def groupByKeyAndWindow(self, window, slide, numPartitions=None):
-        pass
-
-    def reduceByKeyAndWindow(self, reduceFunc, inReduceFunc, window, slide, numPartitions=None):
-        pass
+    def reduceByWindow(self, reduceFunc, invReduceFunc, windowDuration, slideDuration):
+        keyed = self.map(lambda x: (1, x))
+        reduced = keyed.reduceByKeyAndWindow(reduceFunc, invReduceFunc,
+                                             windowDuration, slideDuration, 1)
+        return reduced.map(lambda (k, v): v)
+
+    def countByWindow(self, windowDuration, slideDuration):
+        return self.map(lambda x: 1).reduceByWindow(operator.add, operator.sub,
+                                                    windowDuration, slideDuration)
+
+    def countByValueAndWindow(self, windowDuration, slideDuration, numPartitions=None):
+        keyed = self.map(lambda x: (x, 1))
+        counted = keyed.reduceByKeyAndWindow(lambda a, b: a + b, lambda a, b: a - b,
+                                             windowDuration, slideDuration, numPartitions)
+        return counted.filter(lambda (k, v): v > 0).count()
+
+    def groupByKeyAndWindow(self, windowDuration, slideDuration, numPartitions=None):
+        ls = self.mapValues(lambda x: [x])
+        grouped = ls.reduceByKeyAndWindow(lambda a, b: a.extend(b) or a, lambda a, b: a[len(b):],
+                                          windowDuration, slideDuration, numPartitions)
+        return grouped.mapValues(ResultIterable)
+
+    def reduceByKeyAndWindow(self, func, invFunc,
+                             windowDuration, slideDuration, numPartitions=None):
+        reduced = self.reduceByKey(func)
+
+        def reduceFunc(a, t):
+            return a.reduceByKey(func, numPartitions)
+
+        def invReduceFunc(a, b, t):
+            b = b.reduceByKey(func, numPartitions)
+            joined = a.leftOuterJoin(b, numPartitions)
+            return joined.mapValues(lambda (v1, v2): invFunc(v1, v2) if v2 is not None else v1)
+
+        if not isinstance(windowDuration, Duration):
+            windowDuration = Seconds(windowDuration)
+        if not isinstance(slideDuration, Duration):
+            slideDuration = Seconds(slideDuration)
+        serializer = reduced._jrdd_deserializer
+        jreduceFunc = RDDFunction(self.ctx, reduceFunc, reduced._jrdd_deserializer)
+        jinvReduceFunc = RDDFunction2(self.ctx, invReduceFunc, reduced._jrdd_deserializer)
+        dstream = self.ctx._jvm.PythonReducedWindowedDStream(reduced._jdstream.dstream(),
+                                                             jreduceFunc, jinvReduceFunc,
+                                                             windowDuration._jduration,
+                                                             slideDuration._jduration)
+        return DStream(dstream.asJavaDStream(), self._ssc, serializer)
 
     def updateStateByKey(self, updateFunc):
         # FIXME: convert updateFunc to java JFunction2
         jFunc = updateFunc
         return self._jdstream.updateStateByKey(jFunc)
 
 
-class TransformedRDD(DStream):
+class TransformedDStream(DStream):
     def __init__(self, prev, func, reuse=False):
         ssc = prev._ssc
         self._ssc = ssc
@@ -366,7 +402,8 @@ def __init__(self, prev, func, reuse=False):
         self.is_cached = False
         self.is_checkpointed = False
 
-        if isinstance(prev, TransformedRDD) and not prev.is_cached and not prev.is_checkpointed:
+        if (isinstance(prev, TransformedDStream) and
+                not prev.is_cached and not prev.is_checkpointed):
             prev_func = prev.func
             old_func = func
             func = lambda rdd, t: old_func(prev_func(rdd, t), t)
@@ -388,13 +425,3 @@ def _jdstream(self):
                                                            jfunc, self.reuse).asJavaDStream()
         self._jdstream_val = jdstream
         return jdstream
-
-
-class Transformed2RDD(DStream):
-    def __init__(self, prev, func, other, keepSerializer=False):
-        ssc = prev._ssc
-        jfunc = RDDFunction2(ssc._sc, func, prev._jrdd_deserializer)
-        jdstream = ssc._jvm.PythonTransformed2DStream(prev._jdstream.dstream(),
-                                                      other._jdstream.dstream(), jfunc)
-        jrdd_serializer = prev._jrdd_deserializer if keepSerializer else ssc._sc.serializer
-        DStream.__init__(self, jdstream.asJavaDStream(), ssc, jrdd_serializer)
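
The interesting piece above is reduceByKeyAndWindow: rather than re-reducing every RDD in the window on each slide, the previous window state is combined with the newly arrived batch through func, and the batch that just fell out of the window is backed out through invFunc (the leftOuterJoin keeps keys that have nothing to subtract). The stand-alone Python sketch below mimics that update rule on plain dicts; the helper names and sample batches are invented for illustration and it does not touch Spark at all.

# Stand-alone illustration of the invertible windowed reduce:
#   new window state = old state (+) arriving batch (-) departing batch
# Helper names and sample batches below are invented for this sketch.
import operator


def reduce_by_key(pairs, func):
    """Dict-based equivalent of reduceByKey for one batch."""
    out = {}
    for k, v in pairs:
        out[k] = func(out[k], v) if k in out else v
    return out


def slide(window, arriving, departing, func, inv_func):
    """One slide of the window: fold the new batch in, back the old batch out."""
    window = dict(window)
    for k, v in reduce_by_key(arriving, func).items():
        window[k] = func(window[k], v) if k in window else v
    for k, v in reduce_by_key(departing, func).items():
        if k in window:                      # mirrors the leftOuterJoin None-check
            window[k] = inv_func(window[k], v)
    return window


batches = [[("a", 1)], [("a", 2), ("b", 1)], [("a", 3)], [("b", 4)]]
window_len = 2                               # window covers the last 2 batches
state = {}
for t, batch in enumerate(batches):
    departing = batches[t - window_len] if t >= window_len else []
    state = slide(state, batch, departing, operator.add, operator.sub)
    print(t, sorted(state.items()))
# t=2 prints [('a', 5), ('b', 1)] -- the sum over batches 1 and 2 only.

countByWindow and countByValueAndWindow in the diff are this same pattern with operator.add/operator.sub (or +/- lambdas) as the reduce and inverse-reduce pair.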

python/pyspark/streaming/tests.py

Lines changed: 72 additions & 63 deletions
@@ -33,42 +33,64 @@
 
 
 class PySparkStreamingTestCase(unittest.TestCase):
+
+    timeout = 10  # seconds
+
     def setUp(self):
         class_name = self.__class__.__name__
         self.sc = SparkContext(appName=class_name)
+        self.sc.setCheckpointDir("/tmp")
         self.ssc = StreamingContext(self.sc, duration=Seconds(1))
 
     def tearDown(self):
-        # Do not call pyspark.streaming.context.StreamingContext.stop directly because
-        # we do not wait to shutdown py4j client.
         self.ssc.stop()
         self.sc.stop()
-        time.sleep(1)
 
     @classmethod
     def tearDownClass(cls):
         # Make sure tp shutdown the callback server
         SparkContext._gateway._shutdown_callback_server()
 
+    def _test_func(self, input, func, expected, numSlices=None, sort=False):
+        """
+        Start stream and return the result.
+        @param input: dataset for the test. This should be list of lists.
+        @param func: wrapped function. This function should return PythonDStream object.
+        @param expected: expected output for this testcase.
+        @param numSlices: the number of slices in the rdd in the dstream.
+        """
+        # Generate input stream with user-defined input.
+        input_stream = self.ssc._makeStream(input, numSlices)
+        # Apply test function to stream.
+        stream = func(input_stream)
+        result = stream.collect()
+        self.ssc.start()
 
-class TestBasicOperations(PySparkStreamingTestCase):
-    """
-    2 tests for each function for batach deserializer and unbatch deserilizer because
-    the deserializer is not changed dunamically after streaming process starts.
-    Default numInputPartitions is 2.
-    If the number of input element is over 3, that DStream use batach deserializer.
-    If not, that DStream use unbatch deserializer.
-
-    All tests input should have list of lists(3 lists are default). This list represents stream.
-    Every batch interval, the first object of list are chosen to make DStream.
-    e.g The first list in the list is input of the first batch.
-    Please see the BasicTestSuits in Scala which is close to this implementation.
-    """
-    def setUp(self):
-        PySparkStreamingTestCase.setUp(self)
-        self.timeout = 10  # seconds
-        self.numInputPartitions = 2
+        start_time = time.time()
+        # Loop until get the expected the number of the result from the stream.
+        while True:
+            current_time = time.time()
+            # Check time out.
+            if (current_time - start_time) > self.timeout:
+                break
+            # StreamingContext.awaitTermination is not used to wait because
+            # if py4j server is called every 50 milliseconds, it gets an error.
+            time.sleep(0.05)
+            # Check if the output is the same length of expected output.
+            if len(expected) == len(result):
+                break
+        if sort:
+            self._sort_result_based_on_key(result)
+            self._sort_result_based_on_key(expected)
+        self.assertEqual(expected, result)
 
+    def _sort_result_based_on_key(self, outputs):
+        """Sort the list based on first value."""
+        for output in outputs:
+            output.sort(key=lambda x: x[0])
+
+
+class TestBasicOperations(PySparkStreamingTestCase):
     def test_map(self):
         """Basic operation test for DStream.map."""
         input = [range(1, 5), range(5, 9), range(9, 13)]
@@ -239,54 +261,41 @@ def test_union(self):
                 break
         self.assertEqual(expected, result)
 
-    def _sort_result_based_on_key(self, outputs):
-        """Sort the list base onf first value."""
-        for output in outputs:
-            output.sort(key=lambda x: x[0])
 
-    def _test_func(self, input, func, expected, numSlices=None, sort=False):
-        """
-        Start stream and return the result.
-        @param input: dataset for the test. This should be list of lists.
-        @param func: wrapped function. This function should return PythonDStream object.
-        @param expected: expected output for this testcase.
-        @param numSlices: the number of slices in the rdd in the dstream.
-        """
-        # Generate input stream with user-defined input.
-        numSlices = numSlices or self.numInputPartitions
-        input_stream = self.ssc._makeStream(input, numSlices)
-        # Apply test function to stream.
-        stream = func(input_stream)
-        result = stream.collect()
-        self.ssc.start()
+class TestWindowFunctions(PySparkStreamingTestCase):
 
-        start_time = time.time()
-        # Loop until get the expected the number of the result from the stream.
-        while True:
-            current_time = time.time()
-            # Check time out.
-            if (current_time - start_time) > self.timeout:
-                break
-            # StreamingContext.awaitTermination is not used to wait because
-            # if py4j server is called every 50 milliseconds, it gets an error.
-            time.sleep(0.05)
-            # Check if the output is the same length of expected output.
-            if len(expected) == len(result):
-                break
-        if sort:
-            self._sort_result_based_on_key(result)
-            self._sort_result_based_on_key(expected)
-        self.assertEqual(expected, result)
+    timeout = 15
 
+    def test_count_by_window(self):
+        input = [range(1), range(2), range(3), range(4), range(5), range(6)]
 
-class TestStreamingContext(unittest.TestCase):
-    """
-    Should we have conf property in SparkContext?
-    @property
-    def conf(self):
-        return self._conf
+        def func(dstream):
+            return dstream.countByWindow(4, 1)
+
+        expected = [[1], [3], [6], [9], [12], [15], [11], [6]]
+        self._test_func(input, func, expected)
+
+    def test_count_by_window_large(self):
+        input = [range(1), range(2), range(3), range(4), range(5), range(6)]
 
-    """
+        def func(dstream):
+            return dstream.countByWindow(6, 1)
+
+        expected = [[1], [3], [6], [10], [15], [20], [18], [15], [11], [6]]
+        self._test_func(input, func, expected)
+
+    def test_group_by_key_and_window(self):
+        input = [[('a', i)] for i in range(5)]
+
+        def func(dstream):
+            return dstream.groupByKeyAndWindow(4, 1).mapValues(list)
+
+        expected = [[('a', [0])], [('a', [0, 1])], [('a', [0, 1, 2])], [('a', [1, 2, 3])],
+                    [('a', [2, 3, 4])], [('a', [3, 4])], [('a', [4])]]
+        self._test_func(input, func, expected)
+
+
+class TestStreamingContext(unittest.TestCase):
     def setUp(self):
         self.sc = SparkContext(master="local[2]", appName=self.__class__.__name__)
         self.batachDuration = Seconds(1)
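
A note on the groupByKeyAndWindow test above: the grouping is expressed through the same invertible-reduce machinery, with lambda a, b: a.extend(b) or a appending the values that enter the window (list.extend returns None, so "or a" yields the mutated list) and lambda a, b: a[len(b):] dropping as many of the oldest values as the departing batch contributed. A small stand-alone sketch of those two lambdas, with invented batches and window length, just to show the bookkeeping:

# Stand-alone check of the two lambdas driving the windowed grouping above.
# The batches and window length here are invented for the example.
append_new = lambda a, b: a.extend(b) or a   # extend() returns None, so "or a" yields a
drop_old = lambda a, b: a[len(b):]           # discard values contributed by the departing batch

batches = [[0], [1], [2], [3], [4]]          # one value per batch for a single key
window_len = 3                               # keep the last few batches
grouped = []
for t, batch in enumerate(batches):
    grouped = append_new(grouped, batch)                     # values entering the window
    if t >= window_len:
        grouped = drop_old(grouped, batches[t - window_len])  # values leaving the window
    print(t, grouped)
# prints [0], [0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4] -- the same grow-then-slide
# pattern as the expected output of test_group_by_key_and_window above.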
