Skip to content

Commit 4c7cde2

Browse files
committed
allow specifying a random seed in ALS
1 parent 200bef0 commit 4c7cde2

File tree

2 files changed

+90
-19
lines changed
  • mllib/src
    • main/scala/org/apache/spark/mllib/recommendation
    • test/scala/org/apache/spark/mllib/recommendation

2 files changed

+90
-19
lines changed

mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala

Lines changed: 77 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,15 @@ case class Rating(val user: Int, val product: Int, val rating: Double)
8989
* indicated user
9090
* preferences rather than explicit ratings given to items.
9191
*/
92-
class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var lambda: Double,
93-
var implicitPrefs: Boolean, var alpha: Double)
94-
extends Serializable with Logging
95-
{
92+
class ALS private (
93+
var numBlocks: Int,
94+
var rank: Int,
95+
var iterations: Int,
96+
var lambda: Double,
97+
var implicitPrefs: Boolean,
98+
var alpha: Double,
99+
var seed: Long = System.nanoTime()
100+
) extends Serializable with Logging {
96101
def this() = this(-1, 10, 10, 0.01, false, 1.0)
97102

98103
/**
@@ -132,6 +137,11 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
132137
this
133138
}
134139

140+
def setSeed(seed: Long): ALS = {
141+
this.seed = seed
142+
this
143+
}
144+
135145
/**
136146
* Run ALS with the configured parameters on an input RDD of (user, product, rating) triples.
137147
* Returns a MatrixFactorizationModel with feature vectors for each user and product.
@@ -155,7 +165,7 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
155165

156166
// Initialize user and product factors randomly, but use a deterministic seed for each
157167
// partition so that fault recovery works
158-
val seedGen = new Random()
168+
val seedGen = new Random(seed)
159169
val seed1 = seedGen.nextInt()
160170
val seed2 = seedGen.nextInt()
161171
// Hash an integer to propagate random bits at all positions, similar to java.util.HashTable
@@ -468,6 +478,7 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
468478
* Top-level methods for calling Alternating Least Squares (ALS) matrix factorization.
469479
*/
470480
object ALS {
481+
471482
/**
472483
* Train a matrix factorization model given an RDD of ratings given by users to some products,
473484
* in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
@@ -480,15 +491,39 @@ object ALS {
480491
* @param iterations number of iterations of ALS (recommended: 10-20)
481492
* @param lambda regularization factor (recommended: 0.01)
482493
* @param blocks level of parallelism to split computation into
494+
* @param seed random seed
483495
*/
484496
def train(
485497
ratings: RDD[Rating],
486498
rank: Int,
487499
iterations: Int,
488500
lambda: Double,
489-
blocks: Int)
490-
: MatrixFactorizationModel =
491-
{
501+
blocks: Int,
502+
seed: Long
503+
): MatrixFactorizationModel = {
504+
new ALS(blocks, rank, iterations, lambda, false, 1.0, seed).run(ratings)
505+
}
506+
507+
/**
508+
* Train a matrix factorization model given an RDD of ratings given by users to some products,
509+
* in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
510+
* product of two lower-rank matrices of a given rank (number of features). To solve for these
511+
* features, we run a given number of iterations of ALS. This is done using a level of
512+
* parallelism given by `blocks`.
513+
*
514+
* @param ratings RDD of (userID, productID, rating) pairs
515+
* @param rank number of features to use
516+
* @param iterations number of iterations of ALS (recommended: 10-20)
517+
* @param lambda regularization factor (recommended: 0.01)
518+
* @param blocks level of parallelism to split computation into
519+
*/
520+
def train(
521+
ratings: RDD[Rating],
522+
rank: Int,
523+
iterations: Int,
524+
lambda: Double,
525+
blocks: Int
526+
): MatrixFactorizationModel = {
492527
new ALS(blocks, rank, iterations, lambda, false, 1.0).run(ratings)
493528
}
494529

@@ -505,8 +540,7 @@ object ALS {
505540
* @param lambda regularization factor (recommended: 0.01)
506541
*/
507542
def train(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double)
508-
: MatrixFactorizationModel =
509-
{
543+
: MatrixFactorizationModel = {
510544
train(ratings, rank, iterations, lambda, -1)
511545
}
512546

@@ -522,8 +556,7 @@ object ALS {
522556
* @param iterations number of iterations of ALS (recommended: 10-20)
523557
*/
524558
def train(ratings: RDD[Rating], rank: Int, iterations: Int)
525-
: MatrixFactorizationModel =
526-
{
559+
: MatrixFactorizationModel = {
527560
train(ratings, rank, iterations, 0.01, -1)
528561
}
529562

@@ -540,16 +573,42 @@ object ALS {
540573
* @param lambda regularization factor (recommended: 0.01)
541574
* @param blocks level of parallelism to split computation into
542575
* @param alpha confidence parameter (only applies when immplicitPrefs = true)
576+
* @param seed random seed
543577
*/
544578
def trainImplicit(
545579
ratings: RDD[Rating],
546580
rank: Int,
547581
iterations: Int,
548582
lambda: Double,
549583
blocks: Int,
550-
alpha: Double)
551-
: MatrixFactorizationModel =
552-
{
584+
alpha: Double,
585+
seed: Long
586+
): MatrixFactorizationModel = {
587+
new ALS(blocks, rank, iterations, lambda, true, alpha, seed).run(ratings)
588+
}
589+
590+
/**
591+
* Train a matrix factorization model given an RDD of 'implicit preferences' given by users
592+
* to some products, in the form of (userID, productID, preference) pairs. We approximate the
593+
* ratings matrix as the product of two lower-rank matrices of a given rank (number of features).
594+
* To solve for these features, we run a given number of iterations of ALS. This is done using
595+
* a level of parallelism given by `blocks`.
596+
*
597+
* @param ratings RDD of (userID, productID, rating) pairs
598+
* @param rank number of features to use
599+
* @param iterations number of iterations of ALS (recommended: 10-20)
600+
* @param lambda regularization factor (recommended: 0.01)
601+
* @param blocks level of parallelism to split computation into
602+
* @param alpha confidence parameter (only applies when immplicitPrefs = true)
603+
*/
604+
def trainImplicit(
605+
ratings: RDD[Rating],
606+
rank: Int,
607+
iterations: Int,
608+
lambda: Double,
609+
blocks: Int,
610+
alpha: Double
611+
): MatrixFactorizationModel = {
553612
new ALS(blocks, rank, iterations, lambda, true, alpha).run(ratings)
554613
}
555614

@@ -565,8 +624,8 @@ object ALS {
565624
* @param iterations number of iterations of ALS (recommended: 10-20)
566625
* @param lambda regularization factor (recommended: 0.01)
567626
*/
568-
def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double,
569-
alpha: Double): MatrixFactorizationModel = {
627+
def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double, alpha: Double)
628+
: MatrixFactorizationModel = {
570629
trainImplicit(ratings, rank, iterations, lambda, -1, alpha)
571630
}
572631

@@ -583,8 +642,7 @@ object ALS {
583642
* @param iterations number of iterations of ALS (recommended: 10-20)
584643
*/
585644
def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int)
586-
: MatrixFactorizationModel =
587-
{
645+
: MatrixFactorizationModel = {
588646
trainImplicit(ratings, rank, iterations, 0.01, -1, 1.0)
589647
}
590648

mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import org.scalatest.FunSuite
2525

2626
import org.jblas._
2727

28+
import org.apache.spark.SparkContext._
2829
import org.apache.spark.mllib.util.LocalSparkContext
2930

3031
object ALSSuite {
@@ -115,6 +116,18 @@ class ALSSuite extends FunSuite with LocalSparkContext {
115116
testALS(100, 200, 2, 15, 0.7, 0.4, true, false, true)
116117
}
117118

119+
test("pseudorandomness") {
120+
val ratings = sc.parallelize(ALSSuite.generateRatings(10, 20, 5, 0.5, false, false)._1, 2)
121+
val model11 = ALS.train(ratings, 5, 1, 1.0, 2, 1)
122+
val model12 = ALS.train(ratings, 5, 1, 1.0, 2, 1)
123+
val u11 = model11.userFeatures.values.flatMap(_.toList).collect().toList
124+
val u12 = model12.userFeatures.values.flatMap(_.toList).collect().toList
125+
val model2 = ALS.train(ratings, 5, 1, 1.0, 2, 2)
126+
val u2 = model2.userFeatures.values.flatMap(_.toList).collect().toList
127+
assert(u11 == u12)
128+
assert(u11 != u2)
129+
}
130+
118131
/**
119132
* Test if we can correctly factorize R = U * P where U and P are of known rank.
120133
*

0 commit comments

Comments
 (0)