Skip to content

Commit e249232

Browse files
Pravin Gadakh authored and srowen committed
[SPARK-14370][MLLIB] removed duplicate generation of ids in OnlineLDAOptimizer
## What changes were proposed in this pull request?

Removed duplicated generation of `ids` in OnlineLDAOptimizer.

## How was this patch tested?

Tested with existing unit tests.

Author: Pravin Gadakh <[email protected]>

Closes #12176 from pravingadakh/SPARK-14370.
1 parent 96534aa commit e249232

File tree

2 files changed

+10
-11
lines changed

2 files changed

+10
-11
lines changed

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ class LocalLDAModel private[spark] (
303303
documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) =>
304304
val localElogbeta = ElogbetaBc.value
305305
var docBound = 0.0D
306-
val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference(
306+
val (gammad: BDV[Double], _, _) = OnlineLDAOptimizer.variationalTopicInference(
307307
termCounts, exp(localElogbeta), brzAlpha, gammaShape, k)
308308
val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad)
309309

@@ -354,7 +354,7 @@ class LocalLDAModel private[spark] (
354354
if (termCounts.numNonzeros == 0) {
355355
(id, Vectors.zeros(k))
356356
} else {
357-
val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
357+
val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
358358
termCounts,
359359
expElogbetaBc.value,
360360
docConcentrationBrz,
@@ -377,7 +377,7 @@ class LocalLDAModel private[spark] (
377377
if (termCounts.numNonzeros == 0) {
378378
Vectors.zeros(k)
379379
} else {
380-
val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
380+
val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
381381
termCounts,
382382
expElogbetaBc.value,
383383
docConcentrationBrz,
@@ -403,7 +403,7 @@ class LocalLDAModel private[spark] (
403403
if (document.numNonzeros == 0) {
404404
Vectors.zeros(this.k)
405405
} else {
406-
val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
406+
val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
407407
document,
408408
expElogbeta,
409409
this.docConcentration.toBreeze,

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -466,11 +466,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
466466
val stat = BDM.zeros[Double](k, vocabSize)
467467
var gammaPart = List[BDV[Double]]()
468468
nonEmptyDocs.foreach { case (_, termCounts: Vector) =>
469-
val ids: List[Int] = termCounts match {
470-
case v: DenseVector => (0 until v.size).toList
471-
case v: SparseVector => v.indices.toList
472-
}
473-
val (gammad, sstats) = OnlineLDAOptimizer.variationalTopicInference(
469+
val (gammad, sstats, ids) = OnlineLDAOptimizer.variationalTopicInference(
474470
termCounts, expElogbetaBc.value, alpha, gammaShape, k)
475471
stat(::, ids) := stat(::, ids).toDenseMatrix + sstats
476472
gammaPart = gammad :: gammaPart
@@ -563,13 +559,16 @@ private[clustering] object OnlineLDAOptimizer {
563559
* An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001)
564560
* avoids explicit computation of variational parameter `phi`.
565561
* @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]]
562+
*
563+
* @return Returns a tuple of `gammad` - estimate of gamma, the topic distribution, `sstatsd` -
564+
* statistics for updating lambda and `ids` - list of termCounts vector indices.
566565
*/
567566
private[clustering] def variationalTopicInference(
568567
termCounts: Vector,
569568
expElogbeta: BDM[Double],
570569
alpha: breeze.linalg.Vector[Double],
571570
gammaShape: Double,
572-
k: Int): (BDV[Double], BDM[Double]) = {
571+
k: Int): (BDV[Double], BDM[Double], List[Int]) = {
573572
val (ids: List[Int], cts: Array[Double]) = termCounts match {
574573
case v: DenseVector => ((0 until v.size).toList, v.values)
575574
case v: SparseVector => (v.indices.toList, v.values)
@@ -596,6 +595,6 @@ private[clustering] object OnlineLDAOptimizer {
596595
}
597596

598597
val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ phiNorm).asDenseMatrix
599-
(gammad, sstatsd)
598+
(gammad, sstatsd, ids)
600599
}
601600
}

0 commit comments

Comments
 (0)