@@ -207,7 +207,7 @@ def _defaultReducePartitions(self):
         """
         Returns the default number of partitions to use during reduce tasks (e.g., groupBy).
         If spark.default.parallelism is set, then we'll use the value from SparkContext
-        defaultParallelism, otherwise we'll use the number of partitions in this RDD.
+        defaultParallelism, otherwise we'll use the number of partitions in this RDD
 
         This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce
         the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will
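The docstring above boils down to a simple fallback rule. A minimal standalone sketch of that rule follows, where conf and rdd_num_partitions are hypothetical stand-ins rather than PySpark API:

def default_reduce_partitions(conf, rdd_num_partitions):
    # Prefer spark.default.parallelism when set (mirrors SparkContext.defaultParallelism);
    # otherwise fall back to the partition count of this RDD.
    if "spark.default.parallelism" in conf:
        return int(conf["spark.default.parallelism"])
    return rdd_num_partitions

assert default_reduce_partitions({"spark.default.parallelism": "8"}, 4) == 8
assert default_reduce_partitions({}, 4) == 4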
@@ -222,7 +222,8 @@ def getNumPartitions(self):
         """
         Return the number of partitions in RDD
         """
-        # TODO: remove hardcoding. RDD has NumPartitions but DStream does not have.
+        # TODO: remove hardcoding. RDD has NumPartitions. How do we get the number of partitions
+        # through a DStream?
         return 2
 
     def foreachRDD(self, func):
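Until the hardcoded value is removed, the per-batch partition count can still be observed on the driver through foreachRDD. A hedged sketch, assuming (as with takeAndPrint below) that the callback receives each batch RDD together with its time, that the RDD exposes getNumPartitions(), and with dstream as a hypothetical DStream built from a StreamingContext:

def print_partition_count(rdd, time):
    # Invoked for every generated RDD via the py4j callback server.
    print("batch %s: %d partitions" % (str(time), rdd.getNumPartitions()))

# dstream.foreachRDD(print_partition_count)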
@@ -243,6 +244,10 @@ def pyprint(self):
         operator, so this DStream will be registered as an output stream and there materialized.
         """
         def takeAndPrint(rdd, time):
+            """
+            Closure that takes elements from the RDD and prints the first 10 of them.
+            This closure is called by the py4j callback server.
+            """
             taken = rdd.take(11)
             print "-------------------------------------------"
             print "Time: %s" % (str(time))
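takeAndPrint fetches one element more than it prints so it can tell whether the batch was truncated. A standalone sketch of that pattern on a plain Python list; the trailing "..." behavior is assumed from the Scala print() operator and is not shown in this hunk:

def take_and_print(elements, time, num=10):
    taken = elements[:num + 1]  # rdd.take(num + 1) in the real closure
    print("-------------------------------------------")
    print("Time: %s" % str(time))
    print("-------------------------------------------")
    for record in taken[:num]:
        print(record)
    if len(taken) > num:
        # an extra element means the batch held more data than was printed
        print("...")

take_and_print(list(range(15)), "2014-10-01 12:00:00")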
@@ -307,17 +312,11 @@ def checkpoint(self, interval):
         Mark this DStream for checkpointing. It will be saved to a file inside the
         checkpoint directory set with L{SparkContext.setCheckpointDir()}
 
-        I am not sure this part in DStream
-        and
-        all references to its parent RDDs will be removed. This function must
-        be called before any job has been executed on this RDD. It is strongly
-        recommended that this RDD is persisted in memory, otherwise saving it
-        on a file will require recomputation.
-
-        interval must be pysprak.streaming.duration
+        @param interval: Time interval after which the generated RDD will be checkpointed;
+                         interval must be a pyspark.streaming.duration.Duration
         """
         self.is_checkpointed = True
-        self._jdstream.checkpoint(interval)
+        self._jdstream.checkpoint(interval._jduration)
         return self
 
     def groupByKey(self, numPartitions=None):
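The fix above hands the wrapped Java duration across py4j instead of the Python wrapper object. A standalone sketch of that wrap/unwrap pattern follows; Duration and FakeJavaDStream are illustrative stand-ins, with a plain integer of milliseconds modelling the py4j JavaObject:

class Duration(object):
    def __init__(self, millis):
        self._jduration = millis  # stands in for the Java-side duration object

class FakeJavaDStream(object):
    def checkpoint(self, jduration):
        # the JVM side only understands the Java duration, not the Python wrapper
        assert isinstance(jduration, int)
        self.interval = jduration

jdstream = FakeJavaDStream()
interval = Duration(10000)                # ten seconds
jdstream.checkpoint(interval._jduration)  # unwrap before crossing py4j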
@@ -369,6 +368,10 @@ def saveAsTextFiles(self, prefix, suffix=None):
         Save this DStream as a text file, using string representations of elements.
         """
         def saveAsTextFile(rdd, time):
+            """
+            Closure to save the elements of each RDD in this DStream as a text file.
+            This closure is called by the py4j callback server.
+            """
             path = rddToFileName(prefix, suffix, time)
             rdd.saveAsTextFile(path)
 
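saveAsTextFile derives one output path per batch from the prefix, suffix, and batch time. The helper below sketches that naming under the assumption that rddToFileName follows the Scala convention of "<prefix>-<time>[.<suffix>]"; it is an illustration, not the real pyspark.streaming utility:

def rdd_to_file_name(prefix, suffix, time_ms):
    # One output directory of part files is written per batch, keyed by batch time.
    if suffix is None:
        return "%s-%d" % (prefix, time_ms)
    return "%s-%d.%s" % (prefix, time_ms, suffix)

print(rdd_to_file_name("hdfs:///logs/words", None, 1412150400000))   # .../words-1412150400000
print(rdd_to_file_name("hdfs:///logs/words", "txt", 1412150400000))  # .../words-1412150400000.txt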
@@ -410,9 +413,10 @@ def get_output(rdd, time):
 # TODO: implement countByWindow
 # TODO: implement reduceByWindow
 
-# Following operation has dependency to transform
+# transform operations
 # TODO: implement transform
 # TODO: implement transformWith
+# The following operations depend on transform
 # TODO: implement union
 # TODO: implement repartition
 # TODO: implement cogroup
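As the TODOs note, union, repartition, and cogroup can wait on transform: once per-batch transform and transformWith exist, each of them reduces to an RDD-level operation applied to every batch. A hedged sketch of that dependency, written as free functions over a dstream and assuming transform/transformWith signatures that mirror the Scala DStream API:

def union(dstream, other):
    # per-batch union of the RDDs generated by the two streams
    return dstream.transformWith(lambda a, b: a.union(b), other)

def repartition(dstream, num_partitions):
    # reshuffle every generated RDD into num_partitions partitions
    return dstream.transform(lambda rdd: rdd.repartition(num_partitions))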