 
 from collections import defaultdict
 from itertools import chain, ifilter, imap
-import time
 import operator
 
 from pyspark.serializers import NoOpSerializer,\
     BatchedSerializer, CloudPickleSerializer, pack_long
 from pyspark.rdd import _JavaStackTrace
+from pyspark.storagelevel import StorageLevel
+from pyspark.resultiterable import ResultIterable
 
 from py4j.java_collections import ListConverter, MapConverter
 
@@ -35,6 +36,8 @@ def __init__(self, jdstream, ssc, jrdd_deserializer):
         self._ssc = ssc
         self.ctx = ssc._sc
         self._jrdd_deserializer = jrdd_deserializer
+        self.is_cached = False
+        self.is_checkpointed = False
 
     def context(self):
         """
@@ -247,8 +250,6 @@ def takeAndPrint(rdd, time):
             taken = rdd.take(11)
             print "-------------------------------------------"
             print "Time: %s" % (str(time))
-            print rdd.glom().collect()
-            print "-------------------------------------------"
             print "-------------------------------------------"
             for record in taken[:10]:
                 print record
@@ -303,32 +304,65 @@ def get_output(rdd, time):
 
         self.foreachRDD(get_output)
 
-    def _test_switch_dserializer(self, serializer_que):
+    def cache(self):
+        """
+        Persist this DStream with the default storage level (C{MEMORY_ONLY_SER}).
+        """
+        self.is_cached = True
+        self.persist(StorageLevel.MEMORY_ONLY_SER)
+        return self
+
+    def persist(self, storageLevel):
+        """
+        Set this DStream's storage level to persist its values across operations
+        after the first time it is computed. This can only be used to assign
+        a new storage level if the DStream does not have a storage level set yet.
+        """
+        self.is_cached = True
+        javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel)
+        self._jdstream.persist(javaStorageLevel)
+        return self
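+        # Usage sketch (illustrative only; `lines` stands for any DStream in
+        # the application and is not part of this API):
+        #   lines.cache()  # same as lines.persist(StorageLevel.MEMORY_ONLY_SER)
+        #   lines.persist(StorageLevel.MEMORY_AND_DISK)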
+
+    def checkpoint(self, interval):
         """
-        Deserializer is dynamically changed based on numSlice and the number of
-        input. This function choose deserializer. Currently this is just FIFO.
+        Mark this DStream for checkpointing. It will be saved to a file inside
+        the checkpoint directory set with L{SparkContext.setCheckpointDir()}.
+
+        The rest of this description is carried over from RDD.checkpoint and
+        may not apply to DStream exactly: all references to its parent RDDs
+        will be removed, this function must be called before any job has been
+        executed on it, and it is strongly recommended that it be persisted in
+        memory, since saving it to a file otherwise requires recomputation.
+
+        interval must be a pyspark.streaming.duration
         """
-
-        jrdd_deserializer = self._jrdd_deserializer
+        self.is_checkpointed = True
+        self._jdstream.checkpoint(interval)
+        return self
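+        # Usage sketch (assumes this branch's pyspark.streaming.duration
+        # module exposes a Seconds helper; that name is an assumption):
+        #   dstream.checkpoint(Seconds(10))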
+
+    def groupByKey(self, numPartitions=None):
+        """
+        Return a new DStream by grouping the values of each key of this
+        DStream's (key, value) pairs into a single iterable.
+        """
+        def createCombiner(x):
+            return [x]
 
-        def switch(rdd, jtime):
-            try:
-                print serializer_que
-                jrdd_deserializer = serializer_que.pop(0)
-                print jrdd_deserializer
-            except Exception as e:
-                print e
+        def mergeValue(xs, x):
+            xs.append(x)
+            return xs
 
-        self.foreachRDD(switch)
+        def mergeCombiners(a, b):
+            a.extend(b)
+            return a
 
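+        # How the three functions compose, on hypothetical data: for pairs
+        # ("a", 1), ("a", 2), ("b", 3), createCombiner(1) -> [1],
+        # mergeValue([1], 2) -> [1, 2], and mergeCombiners concatenates the
+        # partition-local lists, yielding ("a", [1, 2]) and ("b", [3]).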
+        return self.combineByKey(createCombiner, mergeValue, mergeCombiners,
+                                 numPartitions).mapValues(lambda x: ResultIterable(x))
 
 
-# TODO: implement groupByKey
+# TODO: implement saveAsTextFile
+
+# The following operations depend on transform:
 # TODO: implement union
-# TODO: implement cache
-# TODO: implement persist
 # TODO: implement repartition
-# TODO: implement saveAsTextFile
 # TODO: implement cogroup
 # TODO: implement join
 # TODO: implement countByValue
@@ -355,6 +389,7 @@ def pipeline_func(split, iterator):
         self._prev_jdstream = prev._prev_jdstream  # maintain the pipeline
         self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer
         self.is_cached = False
+        self.is_checkpointed = False
         self._ssc = prev._ssc
         self.ctx = prev.ctx
         self.prev = prev
@@ -391,4 +426,4 @@ def _jdstream(self):
         return self._jdstream_val
 
     def _is_pipelinable(self):
-        return not self.is_cached
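+        # Pipelining fuses successive transformations into a single pass over
+        # each generated RDD; once a DStream is cached or checkpointed its
+        # output must be materialized, so it can no longer be fused with
+        # downstream operations.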
+        return not (self.is_cached or self.is_checkpointed)