@@ -22,7 +22,9 @@ import scala.reflect.ClassTag
2222
2323import org .scalatest .FunSuite
2424
25+ import org .apache .commons .math3 .distribution .BinomialDistribution
2526import org .apache .commons .math3 .distribution .PoissonDistribution
27+
2628import org .apache .spark ._
2729import org .apache .spark .SparkContext ._
2830import org .apache .spark .rdd ._
@@ -496,29 +498,25 @@ class RDDSuite extends FunSuite with SharedSparkContext {
496498 }
497499
test("computeFraction") {
  // Checks that the fraction returned by computeFraction is large enough that a sample
  // drawn with it contains at least `s` data points, with a failure rate <= 0.0001.
  val data = new EmptyRDD[Int](sc)
  val n = 100000
  val failureRate = 0.0001

  // Sampling with replacement: the test models the sample size as Poisson(frac * n) and
  // requires the `failureRate` quantile of that distribution to be at least `s`.
  for (s <- (1 to 15) ++ List(20, 100, 1000)) {
    val frac = data.computeFraction(s, n, true)
    val poisson = new PoissonDistribution(frac * n)
    assert(poisson.inverseCumulativeProbability(failureRate) >= s, "Computed fraction is too low")
  }

  // Sampling without replacement: the test models the sample size as Binomial(n, frac).
  // inverseCumulativeProbability already returns a number of sampled points (successes out
  // of n trials), so it is compared with `s` directly. Scaling the quantile by `n` would make
  // the assertion hold whenever the quantile is >= 1, regardless of the computed fraction.
  for (s <- List(1, 10, 100, 1000)) {
    val frac = data.computeFraction(s, n, false)
    val binomial = new BinomialDistribution(n, frac)
    assert(binomial.inverseCumulativeProbability(failureRate) >= s, "Computed fraction is too low")
  }
}
524522
@@ -530,37 +528,37 @@ class RDDSuite extends FunSuite with SharedSparkContext {
530528 val sample = data.takeSample(withReplacement= false , num= num)
531529 assert(sample.size === num) // Got exactly num elements
532530 assert(sample.toSet.size === num) // Elements are distinct
533- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
531+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
534532 }
535533 for (seed <- 1 to 5 ) {
536534 val sample = data.takeSample(withReplacement= false , 20 , seed)
537535 assert(sample.size === 20 ) // Got exactly 20 elements
538536 assert(sample.toSet.size === 20 ) // Elements are distinct
539- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
537+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
540538 }
541539 for (seed <- 1 to 5 ) {
542540 val sample = data.takeSample(withReplacement= false , 100 , seed)
543541 assert(sample.size === 100 ) // Got only 100 elements
544542 assert(sample.toSet.size === 100 ) // Elements are distinct
545- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
543+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
546544 }
547545 for (seed <- 1 to 5 ) {
548546 val sample = data.takeSample(withReplacement= true , 20 , seed)
549547 assert(sample.size === 20 ) // Got exactly 20 elements
550- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
548+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
551549 }
552550 {
553551 val sample = data.takeSample(withReplacement= true , num= 20 )
554552 assert(sample.size === 20 ) // Got exactly 20 elements
555553 assert(sample.toSet.size <= 20 , " sampling with replacement returned all distinct elements" )
556- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
554+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
557555 }
558556 {
559557 val sample = data.takeSample(withReplacement= true , num= n)
560558 assert(sample.size === n) // Got exactly 100 elements
561559 // Chance of getting all distinct elements is astronomically low, so test we got < 100
562560 assert(sample.toSet.size < n, " sampling with replacement returned all distinct elements" )
563- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
561+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
564562 }
565563 for (seed <- 1 to 5 ) {
566564 val sample = data.takeSample(withReplacement= true , n, seed)
0 commit comments