@@ -201,7 +201,7 @@ def _defaultReducePartitions(self):
201201 """
202202 Returns the default number of partitions to use during reduce tasks (e.g., groupBy).
203203 If spark.default.parallelism is set, then we'll use the value from SparkContext
204- defaultParallelism, otherwise we'll use the number of partitions in this RDD.
204+ defaultParallelism, otherwise we'll use the number of partitions in this RDD
205205
206206 This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce
207207 the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will
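As a rough sketch of the fallback described in the docstring above (a hypothetical standalone helper, not part of this patch; `sc` is a SparkContext and `rdd` an RDD):

    # Prefer spark.default.parallelism when it is set, otherwise fall back to the
    # partition count of the RDD being reduced.
    def default_reduce_partitions(sc, rdd):
        if sc._conf.contains("spark.default.parallelism"):
            return sc.defaultParallelism
        return rdd.getNumPartitions()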
@@ -216,7 +216,8 @@ def getNumPartitions(self):
216216 """
217217 Return the number of partitions in RDD
218218 """
219- # TODO: remove hardcoding. RDD has NumPartitions but DStream does not have.
219+ # TODO: remove hardcoding. RDD has getNumPartitions, but how do we get the number of
220+ # partitions through a DStream?
220221 return 2
221222
222223 def foreachRDD(self, func):
@@ -236,6 +237,10 @@ def pyprint(self):
236237 operator, so this DStream will be registered as an output stream and there materialized.
237238 """
238239 def takeAndPrint(rdd, time):
240+ """
241+ Closure to take elements from the RDD and print the first 10 of them.
242+ This closure is called by the py4j callback server.
243+ """
239244 taken = rdd.take(11)
240245 print "-------------------------------------------"
241246 print "Time: %s" % (str(time))
@@ -300,17 +305,11 @@ def checkpoint(self, interval):
300305 Mark this DStream for checkpointing. It will be saved to a file inside the
301306 checkpoint directory set with L{SparkContext.setCheckpointDir()}
302307
303- I am not sure this part in DStream
304- and
305- all references to its parent RDDs will be removed. This function must
306- be called before any job has been executed on this RDD. It is strongly
307- recommended that this RDD is persisted in memory, otherwise saving it
308- on a file will require recomputation.
309-
310- interval must be pysprak.streaming.duration
308+ @param interval: Time interval after which the generated RDDs will be checkpointed.
309+ The interval must be a pyspark.streaming.duration.Duration.
311310 """
312311 self.is_checkpointed = True
313- self._jdstream.checkpoint(interval)
312+ self._jdstream.checkpoint(interval._jduration)
314313 return self
315314
316315 def groupByKey(self, numPartitions=None):
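A hedged usage sketch for checkpoint() as documented above (the name `lines` is an illustrative DStream, and the Duration constructor is assumed to take milliseconds, mirroring the Scala API):

    from pyspark.streaming.duration import Duration

    # Mark the stream for checkpointing; generated RDDs are checkpointed every 10 seconds.
    lines.checkpoint(Duration(10000))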
@@ -363,6 +362,10 @@ def saveAsTextFiles(self, prefix, suffix=None):
363362 """
364363
365364 def saveAsTextFile(rdd, time):
365+ """
366+ Closure to save the elements of an RDD in this DStream as a text file.
367+ This closure is called by the py4j callback server.
368+ """
366369 path = rddToFileName(prefix, suffix, time)
367370 rdd.saveAsTextFile(path)
368371
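For context, a sketch of the file naming rddToFileName is expected to produce, based on the Scala counterpart's convention (an assumption, not verified against this branch):

    # Hypothetical illustration: "<prefix>-<time>.<suffix>", or just
    # "<prefix>-<time>" when no suffix is given.
    def rdd_to_file_name_sketch(prefix, suffix, time):
        if suffix is None:
            return "%s-%s" % (prefix, time)
        return "%s-%s.%s" % (prefix, time, suffix)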
@@ -376,6 +379,10 @@ def saveAsPickleFiles(self, prefix, suffix=None):
376379 """
377380
378381 def saveAsPickleFile(rdd, time):
382+ """
383+ Closure to save the elements of an RDD in this DStream as pickled data in a file.
384+ This closure is called by the py4j callback server.
385+ """
379386 path = rddToFileName(prefix, suffix, time)
380387 rdd.saveAsPickleFile(path)
381388
@@ -404,9 +411,10 @@ def get_output(rdd, time):
404411# TODO: implement countByWindow
405412# TODO: implement reduceByWindow
406413
407- # Following operation has dependency to transform
414+ # transform operations
408415# TODO: implement transform
409416# TODO: implement transformWith
417+ # The following operations depend on transform
410418# TODO: implement union
411419 # TODO: implement repartition
412420# TODO: implement cogroup