Skip to content

Commit 896d2e7

Browse files
jkbradley authored and jeanlyn committed
[SPARK-7421] [MLLIB] OnlineLDA cleanups
Small changes, primarily to allow us more flexibility in the future: * Rename "tau_0" to "tau0" * Mark LDAOptimizer trait sealed and DeveloperApi. * Mark LDAOptimizer subclasses as final. * Mark setOptimizer (the one taking an LDAOptimizer) and getOptimizer as DeveloperApi since we may need to change them in the future CC: hhbyyh Author: Joseph K. Bradley <[email protected]> Closes apache#5956 from jkbradley/onlinelda-cleanups and squashes the following commits: f4be508 [Joseph K. Bradley] added newline f4003e4 [Joseph K. Bradley] Changes: * Rename "tau_0" to "tau0" * Mark LDAOptimizer trait sealed and DeveloperApi. * Mark LDAOptimizer subclasses as final. * Mark setOptimizer (the one taking an LDAOptimizer) and getOptimizer as DeveloperApi since we may need to change them in the future
1 parent 3adecd4 commit 896d2e7

File tree

4 files changed

+34
-28
lines changed

4 files changed

+34
-28
lines changed

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,9 @@
1818
package org.apache.spark.mllib.clustering
1919

2020
import breeze.linalg.{DenseVector => BDV}
21+
2122
import org.apache.spark.Logging
22-
import org.apache.spark.annotation.Experimental
23+
import org.apache.spark.annotation.{DeveloperApi, Experimental}
2324
import org.apache.spark.api.java.JavaPairRDD
2425
import org.apache.spark.graphx._
2526
import org.apache.spark.mllib.linalg.Vector
@@ -197,20 +198,28 @@ class LDA private (
197198
}
198199

199200

200-
/** LDAOptimizer used to perform the actual calculation */
201+
/**
202+
* :: DeveloperApi ::
203+
*
204+
* LDAOptimizer used to perform the actual calculation
205+
*/
206+
@DeveloperApi
201207
def getOptimizer: LDAOptimizer = ldaOptimizer
202208

203209
/**
210+
* :: DeveloperApi ::
211+
*
204212
* LDAOptimizer used to perform the actual calculation (default = EMLDAOptimizer)
205213
*/
214+
@DeveloperApi
206215
def setOptimizer(optimizer: LDAOptimizer): this.type = {
207216
this.ldaOptimizer = optimizer
208217
this
209218
}
210219

211220
/**
212221
* Set the LDAOptimizer used to perform the actual calculation by algorithm name.
213-
* Currently "em", "online" is supported.
222+
* Currently "em", "online" are supported.
214223
*/
215224
def setOptimizer(optimizerName: String): this.type = {
216225
this.ldaOptimizer =

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala

Lines changed: 17 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -23,21 +23,21 @@ import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, sum, normalize, kr
2323
import breeze.numerics.{digamma, exp, abs}
2424
import breeze.stats.distributions.{Gamma, RandBasis}
2525

26-
import org.apache.spark.annotation.Experimental
26+
import org.apache.spark.annotation.DeveloperApi
2727
import org.apache.spark.graphx._
2828
import org.apache.spark.graphx.impl.GraphImpl
2929
import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer
3030
import org.apache.spark.mllib.linalg.{Matrices, SparseVector, DenseVector, Vector}
3131
import org.apache.spark.rdd.RDD
3232

3333
/**
34-
* :: Experimental ::
34+
* :: DeveloperApi ::
3535
*
3636
* An LDAOptimizer specifies which optimization/learning/inference algorithm to use, and it can
3737
* hold optimizer-specific parameters for users to set.
3838
*/
39-
@Experimental
40-
trait LDAOptimizer {
39+
@DeveloperApi
40+
sealed trait LDAOptimizer {
4141

4242
/*
4343
DEVELOPERS NOTE:
@@ -59,7 +59,7 @@ trait LDAOptimizer {
5959
}
6060

6161
/**
62-
* :: Experimental ::
62+
* :: DeveloperApi ::
6363
*
6464
* Optimizer for EM algorithm which stores data + parameter graph, plus algorithm parameters.
6565
*
@@ -75,8 +75,8 @@ trait LDAOptimizer {
7575
* "On Smoothing and Inference for Topic Models." UAI, 2009.
7676
*
7777
*/
78-
@Experimental
79-
class EMLDAOptimizer extends LDAOptimizer {
78+
@DeveloperApi
79+
final class EMLDAOptimizer extends LDAOptimizer {
8080

8181
import LDA._
8282

@@ -211,7 +211,7 @@ class EMLDAOptimizer extends LDAOptimizer {
211211

212212

213213
/**
214-
* :: Experimental ::
214+
* :: DeveloperApi ::
215215
*
216216
* An online optimizer for LDA. The Optimizer implements the Online variational Bayes LDA
217217
* algorithm, which processes a subset of the corpus on each iteration, and updates the term-topic
@@ -220,8 +220,8 @@ class EMLDAOptimizer extends LDAOptimizer {
220220
* Original Online LDA paper:
221221
* Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010.
222222
*/
223-
@Experimental
224-
class OnlineLDAOptimizer extends LDAOptimizer {
223+
@DeveloperApi
224+
final class OnlineLDAOptimizer extends LDAOptimizer {
225225

226226
// LDA common parameters
227227
private var k: Int = 0
@@ -243,8 +243,8 @@ class OnlineLDAOptimizer extends LDAOptimizer {
243243
private var randomGenerator: java.util.Random = null
244244

245245
// Online LDA specific parameters
246-
// Learning rate is: (tau_0 + t)^{-kappa}
247-
private var tau_0: Double = 1024
246+
// Learning rate is: (tau0 + t)^{-kappa}
247+
private var tau0: Double = 1024
248248
private var kappa: Double = 0.51
249249
private var miniBatchFraction: Double = 0.05
250250

@@ -265,16 +265,16 @@ class OnlineLDAOptimizer extends LDAOptimizer {
265265
* A (positive) learning parameter that downweights early iterations. Larger values make early
266266
* iterations count less.
267267
*/
268-
def getTau_0: Double = this.tau_0
268+
def getTau0: Double = this.tau0
269269

270270
/**
271271
* A (positive) learning parameter that downweights early iterations. Larger values make early
272272
* iterations count less.
273273
* Default: 1024, following the original Online LDA paper.
274274
*/
275-
def setTau_0(tau_0: Double): this.type = {
276-
require(tau_0 > 0, s"LDA tau_0 must be positive, but was set to $tau_0")
277-
this.tau_0 = tau_0
275+
def setTau0(tau0: Double): this.type = {
276+
require(tau0 > 0, s"LDA tau0 must be positive, but was set to $tau0")
277+
this.tau0 = tau0
278278
this
279279
}
280280

@@ -434,11 +434,8 @@ class OnlineLDAOptimizer extends LDAOptimizer {
434434
* Update lambda based on the batch submitted. batchSize can be different for each iteration.
435435
*/
436436
private[clustering] def update(stat: BDM[Double], iter: Int, batchSize: Int): Unit = {
437-
val tau_0 = this.getTau_0
438-
val kappa = this.getKappa
439-
440437
// weight of the mini-batch.
441-
val weight = math.pow(tau_0 + iter, -kappa)
438+
val weight = math.pow(getTau0 + iter, -getKappa)
442439

443440
// Update lambda based on documents.
444441
lambda = lambda * (1 - weight) +

mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -117,7 +117,7 @@ public void OnlineOptimizerCompatibility() {
117117

118118
// Train a model
119119
OnlineLDAOptimizer op = new OnlineLDAOptimizer()
120-
.setTau_0(1024)
120+
.setTau0(1024)
121121
.setKappa(0.51)
122122
.setGammaShape(1e40)
123123
.setMiniBatchFraction(0.5);

mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -138,12 +138,12 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
138138
val lda = new LDA().setK(2)
139139
val corpus = sc.parallelize(tinyCorpus, 2)
140140
val op = new OnlineLDAOptimizer().initialize(corpus, lda)
141-
op.setKappa(0.9876).setMiniBatchFraction(0.123).setTau_0(567)
141+
op.setKappa(0.9876).setMiniBatchFraction(0.123).setTau0(567)
142142
assert(op.getAlpha == 0.5) // default 1.0 / k
143143
assert(op.getEta == 0.5) // default 1.0 / k
144144
assert(op.getKappa == 0.9876)
145145
assert(op.getMiniBatchFraction == 0.123)
146-
assert(op.getTau_0 == 567)
146+
assert(op.getTau0 == 567)
147147
}
148148

149149
test("OnlineLDAOptimizer one iteration") {
@@ -159,7 +159,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
159159
val corpus = sc.parallelize(docs, 2)
160160

161161
// Set GammaShape large to avoid the stochastic impact.
162-
val op = new OnlineLDAOptimizer().setTau_0(1024).setKappa(0.51).setGammaShape(1e40)
162+
val op = new OnlineLDAOptimizer().setTau0(1024).setKappa(0.51).setGammaShape(1e40)
163163
.setMiniBatchFraction(1)
164164
val lda = new LDA().setK(k).setMaxIterations(1).setOptimizer(op).setSeed(12345)
165165

@@ -192,7 +192,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
192192
).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
193193

194194
val docs = sc.parallelize(toydata)
195-
val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau_0(1024).setKappa(0.51)
195+
val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
196196
.setGammaShape(1e10)
197197
val lda = new LDA().setK(2)
198198
.setDocConcentration(0.01)

0 commit comments

Comments (0)