1818import sys
1919import random
2020
21+
2122class RDDSampler (object ):
2223 def __init__ (self , withReplacement , fraction , seed = None ):
2324 try :
2425 import numpy
2526 self ._use_numpy = True
2627 except ImportError :
27- print >> sys .stderr , "NumPy does not appear to be installed. Falling back to default random generator for sampling."
28+ print >> sys .stderr , (
29+ "NumPy does not appear to be installed. "
30+ "Falling back to default random generator for sampling." )
2831 self ._use_numpy = False
2932
3033 self ._seed = seed if seed is not None else random .randint (0 , sys .maxint )
@@ -61,7 +64,7 @@ def getUniformSample(self, split):
6164 def getPoissonSample (self , split , mean ):
6265 if not self ._rand_initialized or split != self ._split :
6366 self .initRandomGenerator (split )
64-
67+
6568 if self ._use_numpy :
6669 return self ._random .poisson (mean )
6770 else :
@@ -80,30 +83,27 @@ def getPoissonSample(self, split, mean):
8083 num_arrivals += 1
8184
8285 return (num_arrivals - 1 )
83-
86+
def shuffle(self, vals):
    """Shuffle ``vals`` in place using this sampler's random generator.

    If no generator has been created yet (``self._random`` is ``None``),
    one is initialised for split 0 first.

    :param vals: mutable sequence to reorder in place.
    :return: ``None``; ``vals`` itself is modified.
    """
    if self._random is None:
        # This should only ever be called on the master, so the split
        # does not matter.
        self.initRandomGenerator(0)

    # Both the NumPy and the stdlib generator expose ``shuffle(seq)``.
    # The stdlib call used to pass ``self._random.random`` as a second
    # argument; that is already the default on Python 2 and the parameter
    # was removed in Python 3.11, so it is omitted and one call serves
    # both back ends.
    self._random.shuffle(vals)
9396
def func(self, split, iterator):
    """Lazily yield the sampled elements of ``iterator`` for one split.

    With replacement, each element is emitted as many times as
    ``getPoissonSample`` returns for it (possibly zero). Without
    replacement, each element is emitted at most once, when the value
    returned by ``getUniformSample`` is below ``self._fraction``.

    :param split: identifier of the split, forwarded to the sample helpers.
    :param iterator: iterable of the split's elements.
    :return: generator over the sampled elements, in input order.
    """
    if self._withReplacement:
        for obj in iterator:
            # For large datasets, the expected number of occurrences of
            # each element in a sample with replacement is Poisson(frac).
            # We use that to get a count for each element.
            count = self.getPoissonSample(split, mean=self._fraction)
            for _ in range(0, count):
                yield obj
    else:
        for obj in iterator:
            # Strict comparison so that a fraction of 0 never selects
            # anything, even when the draw is exactly 0.0.
            # NOTE(review): assumes getUniformSample draws from [0, 1) --
            # confirm against its definition.
            if self.getUniformSample(split) < self._fraction:
                yield obj
106-
107-
108-
109-
0 commit comments