[SPARK-14370][MLLIB] removed duplicate generation of ids in OnlineLDAOptimizer

Pravin Gadakh · srowen · commit e24923267f79 · 2016-04-15T13:08:30.000+01:00
## What changes were proposed in this pull request? Removed duplicated generation of `ids` in OnlineLDAOptimizer. ## How was this patch tested? tested with existing unit tests. Author: Pravin Gadakh <prgadakh@in.ibm.com> Closes #12176 from pravingadakh/SPARK-14370.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -303,7 +303,7 @@ class LocalLDAModel private[spark] (
       documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) =>
         val localElogbeta = ElogbetaBc.value
         var docBound = 0.0D
-        val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference(
+        val (gammad: BDV[Double], _, _) = OnlineLDAOptimizer.variationalTopicInference(
           termCounts, exp(localElogbeta), brzAlpha, gammaShape, k)
         val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad)
 
@@ -354,7 +354,7 @@ class LocalLDAModel private[spark] (
       if (termCounts.numNonzeros == 0) {
         (id, Vectors.zeros(k))
       } else {
-        val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+        val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
           termCounts,
           expElogbetaBc.value,
           docConcentrationBrz,
@@ -377,7 +377,7 @@ class LocalLDAModel private[spark] (
       if (termCounts.numNonzeros == 0) {
         Vectors.zeros(k)
       } else {
-        val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+        val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
           termCounts,
           expElogbetaBc.value,
           docConcentrationBrz,
@@ -403,7 +403,7 @@ class LocalLDAModel private[spark] (
     if (document.numNonzeros == 0) {
       Vectors.zeros(this.k)
     } else {
-      val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+      val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
         document,
         expElogbeta,
         this.docConcentration.toBreeze,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -466,11 +466,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
       val stat = BDM.zeros[Double](k, vocabSize)
       var gammaPart = List[BDV[Double]]()
       nonEmptyDocs.foreach { case (_, termCounts: Vector) =>
-        val ids: List[Int] = termCounts match {
-          case v: DenseVector => (0 until v.size).toList
-          case v: SparseVector => v.indices.toList
-        }
-        val (gammad, sstats) = OnlineLDAOptimizer.variationalTopicInference(
+        val (gammad, sstats, ids) = OnlineLDAOptimizer.variationalTopicInference(
           termCounts, expElogbetaBc.value, alpha, gammaShape, k)
         stat(::, ids) := stat(::, ids).toDenseMatrix + sstats
         gammaPart = gammad :: gammaPart
@@ -563,13 +559,16 @@ private[clustering] object OnlineLDAOptimizer {
    * An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001)
    * avoids explicit computation of variational parameter `phi`.
    * @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]]
+   *
+   * @return Returns a tuple of `gammad` - estimate of gamma, the topic distribution, `sstatsd` -
+   *         statistics for updating lambda and `ids` - list of termCounts vector indices.
    */
   private[clustering] def variationalTopicInference(
       termCounts: Vector,
       expElogbeta: BDM[Double],
       alpha: breeze.linalg.Vector[Double],
       gammaShape: Double,
-      k: Int): (BDV[Double], BDM[Double]) = {
+      k: Int): (BDV[Double], BDM[Double], List[Int]) = {
     val (ids: List[Int], cts: Array[Double]) = termCounts match {
       case v: DenseVector => ((0 until v.size).toList, v.values)
       case v: SparseVector => (v.indices.toList, v.values)
@@ -596,6 +595,6 @@ private[clustering] object OnlineLDAOptimizer {
     }
 
     val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ phiNorm).asDenseMatrix
-    (gammad, sstatsd)
+    (gammad, sstatsd, ids)
   }
 }