 
 from collections import defaultdict
 from itertools import chain, ifilter, imap
-import time
 import operator
 
 from pyspark.serializers import NoOpSerializer,\
     BatchedSerializer, CloudPickleSerializer, pack_long
 from pyspark.rdd import _JavaStackTrace
+from pyspark.storagelevel import StorageLevel
+from pyspark.resultiterable import ResultIterable
 
 from py4j.java_collections import ListConverter, MapConverter
 
@@ -35,6 +36,8 @@ def __init__(self, jdstream, ssc, jrdd_deserializer):
         self._ssc = ssc
         self.ctx = ssc._sc
         self._jrdd_deserializer = jrdd_deserializer
+        self.is_cached = False
+        self.is_checkpointed = False
 
     def context(self):
         """
@@ -247,8 +250,6 @@ def takeAndPrint(rdd, time):
             taken = rdd.take(11)
             print "-------------------------------------------"
             print "Time: %s" % (str(time))
-            print rdd.glom().collect()
-            print "-------------------------------------------"
             print "-------------------------------------------"
             for record in taken[:10]:
                 print record
@@ -303,32 +304,65 @@ def get_output(rdd, time):
 
         self.foreachRDD(get_output)
 
-    def _test_switch_dserializer(self, serializer_que):
+    def cache(self):
+        """
+        Persist this DStream with the default storage level (C{MEMORY_ONLY_SER}).
+        """
+        self.is_cached = True
+        self.persist(StorageLevel.MEMORY_ONLY_SER)
+        return self
+
+    def persist(self, storageLevel):
+        """
+        Set this DStream's storage level to persist its values across operations
+        after the first time it is computed. This can only be used to assign
+        a new storage level if the DStream does not have a storage level set yet.
+        """
+        self.is_cached = True
+        javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel)
+        self._jdstream.persist(javaStorageLevel)
+        return self
+
+    def checkpoint(self, interval):
         """
-        Deserializer is dynamically changed based on numSlice and the number of
-        input. This function choose deserializer. Currently this is just FIFO.
+        Mark this DStream for checkpointing. It will be saved to a file inside
+        the checkpoint directory set with L{SparkContext.setCheckpointDir()}.
+
+        It is not yet clear whether the following also holds for DStream:
+        all references to its parent RDDs will be removed, this function must
+        be called before any job has been executed on this RDD, and it is
+        strongly recommended that this RDD is persisted in memory, since
+        saving it to a file would otherwise require recomputation.
+
+        The interval argument must be a pyspark.streaming.duration value.
         """
-
-        jrdd_deserializer = self._jrdd_deserializer
+        self.is_checkpointed = True
+        self._jdstream.checkpoint(interval)
+        return self
+
+    def groupByKey(self, numPartitions=None):
+        def createCombiner(x):
+            return [x]
 
-        def switch(rdd, jtime):
-            try:
-                print serializer_que
-                jrdd_deserializer = serializer_que.pop(0)
-                print jrdd_deserializer
-            except Exception as e:
-                print e
+        def mergeValue(xs, x):
+            xs.append(x)
+            return xs
 
-        self.foreachRDD(switch)
+        def mergeCombiners(a, b):
+            a.extend(b)
+            return a
 
+        return self.combineByKey(createCombiner, mergeValue, mergeCombiners,
+                                 numPartitions).mapValues(lambda x: ResultIterable(x))
 
 
 # TODO: implement groupByKey
+# TODO: implement saveAsTextFile
+
+# The following operations depend on transform:
 # TODO: implement union
-# TODO: implement cache
-# TODO: implement persist
 # TODO: implement repartition
-# TODO: implement saveAsTextFile
 # TODO: implement cogroup
 # TODO: implement join
 # TODO: implement countByValue
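
The groupByKey added in this hunk is expressed through combineByKey: createCombiner starts a one-element list for the first value seen for a key, mergeValue appends further values within a partition, and mergeCombiners concatenates the per-partition lists, which are finally wrapped in ResultIterable. Below is a minimal, Spark-free sketch of that combiner protocol using the same helper names as the diff; the simulate_group_by_key driver is purely illustrative and not part of the patch.

from collections import defaultdict


def createCombiner(x):
    return [x]


def mergeValue(xs, x):
    xs.append(x)
    return xs


def mergeCombiners(a, b):
    a.extend(b)
    return a


def simulate_group_by_key(partitions):
    # Phase 1: combine values within each partition (createCombiner/mergeValue).
    per_partition = []
    for part in partitions:
        combined = {}
        for k, v in part:
            if k in combined:
                combined[k] = mergeValue(combined[k], v)
            else:
                combined[k] = createCombiner(v)
        per_partition.append(combined)
    # Phase 2: merge the per-partition results across partitions (mergeCombiners).
    grouped = defaultdict(list)
    for combined in per_partition:
        for k, xs in combined.items():
            grouped[k] = mergeCombiners(grouped[k], xs)
    return dict(grouped)


assert simulate_group_by_key(
    [[("a", 1), ("b", 2), ("a", 3)], [("a", 4)]]
) == {"a": [1, 3, 4], "b": [2]}
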
@@ -355,6 +389,7 @@ def pipeline_func(split, iterator):
             self._prev_jdstream = prev._prev_jdstream  # maintain the pipeline
             self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer
         self.is_cached = False
+        self.is_checkpointed = False
         self._ssc = prev._ssc
         self.ctx = prev.ctx
         self.prev = prev
@@ -391,4 +426,4 @@ def _jdstream(self):
         return self._jdstream_val
 
     def _is_pipelinable(self):
-        return not self.is_cached
+        return not (self.is_cached or self.is_checkpointed)
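
The _is_pipelinable change above keeps a cached or checkpointed stream from being fused with downstream transformations, so its materialized result can be reused instead of being recomputed inside a longer pipelined function. The following standalone sketch shows only that decision rule; FakeStream and map_stage are illustrative stand-ins, not PySpark classes.

class FakeStream(object):
    """Illustrative stand-in for a DStream-like object; not a PySpark class."""

    def __init__(self, funcs=None, is_cached=False, is_checkpointed=False):
        self.funcs = funcs or []
        self.is_cached = is_cached
        self.is_checkpointed = is_checkpointed

    def _is_pipelinable(self):
        # Same rule as in the diff: once cached or checkpointed, stop fusing.
        return not (self.is_cached or self.is_checkpointed)

    def map_stage(self, f):
        if self._is_pipelinable():
            # Fuse with the parent: both functions run in a single stage.
            return FakeStream(self.funcs + [f])
        # Parent is materialized; a new stage starts from its stored output.
        return FakeStream([f])


fused = FakeStream().map_stage(lambda x: x + 1).map_stage(lambda x: x * 2)
assert len(fused.funcs) == 2   # two maps fused into one stage

cached = FakeStream(is_cached=True).map_stage(lambda x: x + 1)
assert len(cached.funcs) == 1  # cached parent: downstream map starts a new stage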