@@ -22,7 +22,9 @@ import scala.reflect.ClassTag
2222
2323import  org .scalatest .FunSuite 
2424
25+ import  org .apache .commons .math3 .distribution .BinomialDistribution 
2526import  org .apache .commons .math3 .distribution .PoissonDistribution 
27+ 
2628import  org .apache .spark ._ 
2729import  org .apache .spark .SparkContext ._ 
2830import  org .apache .spark .rdd ._ 
@@ -496,29 +498,25 @@ class RDDSuite extends FunSuite with SharedSparkContext {
496498  }
497499
498500  test(" computeFraction"  ) {
499-     //  test that the computed fraction guarantees enough datapoints in the sample with a failure rate <= 0.0001
501+     //  test that the computed fraction guarantees enough datapoints
502+     //  in the sample with a failure rate <= 0.0001
500503    val  data  =  new  EmptyRDD [Int ](sc)
501504    val  n  =  100000 
502505
503506    for  (s <-  1  to 15 ) {
504507      val  frac  =  data.computeFraction(s, n, true )
505-       val  qpois  =  new  PoissonDistribution (frac *  n)
506-       assert(qpois .inverseCumulativeProbability(0.0001 ) >=  s, " Computed fraction is too low"  )
508+       val  poisson  =  new  PoissonDistribution (frac *  n)
509+       assert(poisson .inverseCumulativeProbability(0.0001 ) >=  s, " Computed fraction is too low"  )
507510    }
508-     for  (s <-  1  to 15 ) {
509-       val  frac  =  data.computeFraction(s, n, false )
510-       val  qpois  =  new  PoissonDistribution (frac *  n)
511-       assert(qpois.inverseCumulativeProbability(0.0001 ) >=  s, " Computed fraction is too low"  )
512-     }
513-     for  (s <-  List (1 , 10 , 100 , 1000 )) {
511+     for  (s <-  List (20 , 100 , 1000 )) {
514512      val  frac  =  data.computeFraction(s, n, true )
515-       val  qpois  =  new  PoissonDistribution (frac *  n)
516-       assert(qpois .inverseCumulativeProbability(0.0001 ) >=  s, " Computed fraction is too low"  )
513+       val  poisson  =  new  PoissonDistribution (frac *  n)
514+       assert(poisson .inverseCumulativeProbability(0.0001 ) >=  s, " Computed fraction is too low"  )
517515    }
518516    for  (s <-  List (1 , 10 , 100 , 1000 )) {
519517      val  frac  =  data.computeFraction(s, n, false )
520-       val  qpois  =  new  PoissonDistribution (frac  *  n )
521-       assert(qpois .inverseCumulativeProbability(0.0001 ) >=  s, " Computed fraction is too low"  )
518+       val  binomial  =  new  BinomialDistribution (n, frac )
519+       assert(binomial .inverseCumulativeProbability(0.0001 )* n  >=  s, " Computed fraction is too low"  )
522520    }
523521  }
524522
@@ -530,37 +528,37 @@ class RDDSuite extends FunSuite with SharedSparkContext {
530528      val  sample  =  data.takeSample(withReplacement= false , num= num)
531529      assert(sample.size ===  num)        //  Got exactly num elements
532530      assert(sample.toSet.size ===  num)  //  Elements are distinct
533-       assert(sample.forall(x =>  1  <=  x &&  x <=  n), " elements not in [1, 100 ]"  )
531+       assert(sample.forall(x =>  1  <=  x &&  x <=  n), s " elements not in [1,  $n ] " )
534532    }
535533    for  (seed <-  1  to 5 ) {
536534      val  sample  =  data.takeSample(withReplacement= false , 20 , seed)
537535      assert(sample.size ===  20 )        //  Got exactly 20 elements
538536      assert(sample.toSet.size ===  20 )  //  Elements are distinct
539-       assert(sample.forall(x =>  1  <=  x &&  x <=  n), " elements not in [1, 100 ]"  )
537+       assert(sample.forall(x =>  1  <=  x &&  x <=  n), s " elements not in [1,  $n ] " )
540538    }
541539    for  (seed <-  1  to 5 ) {
542540      val  sample  =  data.takeSample(withReplacement= false , 100 , seed)
543541      assert(sample.size ===  100 )        //  Got only 100 elements
544542      assert(sample.toSet.size ===  100 )  //  Elements are distinct
545-       assert(sample.forall(x =>  1  <=  x &&  x <=  n), " elements not in [1, 100 ]"  )
543+       assert(sample.forall(x =>  1  <=  x &&  x <=  n), s " elements not in [1,  $n ] " )
546544    }
547545    for  (seed <-  1  to 5 ) {
548546      val  sample  =  data.takeSample(withReplacement= true , 20 , seed)
549547      assert(sample.size ===  20 )        //  Got exactly 20 elements
550-       assert(sample.forall(x =>  1  <=  x &&  x <=  n), " elements not in [1, 100 ]"  )
548+       assert(sample.forall(x =>  1  <=  x &&  x <=  n), s " elements not in [1,  $n ] " )
551549    }
552550    {
553551      val  sample  =  data.takeSample(withReplacement= true , num= 20 )
554552      assert(sample.size ===  20 )        //  Got exactly 20 elements
555553      assert(sample.toSet.size <=  20 , " sampling with replacement returned all distinct elements"  )
556-       assert(sample.forall(x =>  1  <=  x &&  x <=  n), " elements not in [1, 100 ]"  )
554+       assert(sample.forall(x =>  1  <=  x &&  x <=  n), s " elements not in [1,  $n ] " )
557555    }
558556    {
559557      val  sample  =  data.takeSample(withReplacement= true , num= n)
560558      assert(sample.size ===  n)        //  Got exactly 100 elements
561559      //  Chance of getting all distinct elements is astronomically low, so test we got < 100
562560      assert(sample.toSet.size <  n, " sampling with replacement returned all distinct elements"  )
563-       assert(sample.forall(x =>  1  <=  x &&  x <=  n), " elements not in [1, 100 ]"  )
561+       assert(sample.forall(x =>  1  <=  x &&  x <=  n), s " elements not in [1,  $n ] " )
564562    }
565563    for  (seed <-  1  to 5 ) {
566564      val  sample  =  data.takeSample(withReplacement= true , n, seed)
0 commit comments