From 3b32c8c64a1cf560857533544f58491c5c7d204d Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 28 Apr 2015 23:53:15 +0530 Subject: [PATCH 1/5] [SPARK-7045] Avoid intermediate representation when creating model --- .../apache/spark/mllib/feature/Word2Vec.scala | 80 +++++++++++-------- .../spark/mllib/feature/Word2VecSuite.scala | 7 ++ 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index f087d06d2a46a..fce2021ad1aed 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -403,17 +403,13 @@ class Word2Vec extends Serializable with Logging { } newSentences.unpersist() - val word2VecMap = mutable.HashMap.empty[String, Array[Float]] + val wordArray = new Array[String](vocabSize) var i = 0 while (i < vocabSize) { - val word = bcVocab.value(i).word - val vector = new Array[Float](vectorSize) - Array.copy(syn0Global, i * vectorSize, vector, 0, vectorSize) - word2VecMap += word -> vector + wordArray(i) = bcVocab.value(i).word i += 1 } - - new Word2VecModel(word2VecMap.toMap) + new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global) } /** @@ -429,38 +425,40 @@ class Word2Vec extends Serializable with Logging { /** * :: Experimental :: * Word2Vec model + * + * @param wordIndex: Maps each word to an index, which can retrieve the corresponding + * vector from wordVectors (see below). + * @param wordVectors: Array of length numWords * vectorSize, vector corresponding + * to the word mapped with index i can be retrieved by the slice + * (i * vectorSize, i * vectorSize + vectorSize) */ @Experimental -class Word2VecModel private[spark] ( - model: Map[String, Array[Float]]) extends Serializable with Saveable { - - // wordList: Ordered list of words obtained from model. 
- private val wordList: Array[String] = model.keys.toArray +class Word2VecModel private[mllib] ( + wordIndex: Map[String, Int], + wordVectors: Array[Float]) extends Serializable with Saveable { - // wordIndex: Maps each word to an index, which can retrieve the corresponding - // vector from wordVectors (see below). - private val wordIndex: Map[String, Int] = wordList.zip(0 until model.size).toMap - - // vectorSize: Dimension of each word's vector. - private val vectorSize = model.head._2.size private val numWords = wordIndex.size + // vectorSize: Dimension of each word's vector. + private val vectorSize = wordVectors.length / numWords + + // wordList: Ordered list of words obtained from wordIndex. + private val wordList: Array[String] = wordIndex.keys.toArray - // wordVectors: Array of length numWords * vectorSize, vector corresponding to the word - // mapped with index i can be retrieved by the slice - // (ind * vectorSize, ind * vectorSize + vectorSize) // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. 
- private val (wordVectors: Array[Float], wordVecNorms: Array[Double]) = { - val wordVectors = new Array[Float](vectorSize * numWords) + private val wordVecNorms: Array[Double] = { val wordVecNorms = new Array[Double](numWords) var i = 0 while (i < numWords) { - val vec = model.get(wordList(i)).get - Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize) + val vec = wordVectors.slice(i * vectorSize, i * vectorSize + vectorSize) wordVecNorms(i) = blas.snrm2(vectorSize, vec, 1) i += 1 } - (wordVectors, wordVecNorms) + wordVecNorms + } + + private[mllib] def this(model: Map[String, Array[Float]]) = { + this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model)) } private def cosineSimilarity(v1: Array[Float], v2: Array[Float]): Double = { @@ -484,8 +482,9 @@ class Word2VecModel private[spark] ( * @return vector representation of word */ def transform(word: String): Vector = { - model.get(word) match { - case Some(vec) => + wordIndex.get(word) match { + case Some(ind) => + val vec = wordVectors.slice(ind * vectorSize, ind * vectorSize + vectorSize) Vectors.dense(vec.map(_.toDouble)) case None => throw new IllegalStateException(s"$word not in vocabulary") @@ -511,7 +510,7 @@ class Word2VecModel private[spark] ( */ def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = { require(num > 0, "Number of similar words should > 0") - + // TODO: optimize top-k val fVector = vector.toArray.map(_.toFloat) val cosineVec = Array.fill[Float](numWords)(0) val alpha: Float = 1 @@ -521,13 +520,13 @@ class Word2VecModel private[spark] ( "T", vectorSize, numWords, alpha, wordVectors, vectorSize, fVector, 1, beta, cosineVec, 1) // Need not divide with the norm of the given vector since it is constant. 
- val updatedCosines = new Array[Double](numWords) + val cosVec = cosineVec.map(_.toDouble) var ind = 0 while (ind < numWords) { - updatedCosines(ind) = cosineVec(ind) / wordVecNorms(ind) + cosVec(ind) /= wordVecNorms(ind) ind += 1 } - wordList.zip(updatedCosines) + wordList.zip(cosVec) .toSeq .sortBy(- _._2) .take(num + 1) @@ -548,6 +547,23 @@ class Word2VecModel private[spark] ( @Experimental object Word2VecModel extends Loader[Word2VecModel] { + private def buildWordIndex(model: Map[String, Array[Float]]) = { + model.keys.zipWithIndex.toMap + } + + private def buildWordVectors(model: Map[String, Array[Float]]) = { + val (vectorSize, numWords) = (model.head._2.size, model.size) + val wordList = model.keys.toArray + val wordVectors = new Array[Float](vectorSize * numWords) + var i = 0 + while (i < numWords) { + val vec = model.get(wordList(i)).get + Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize) + i += 1 + } + wordVectors + } + private object SaveLoadV1_0 { val formatVersionV1_0 = "1.0" diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index b6818369208d7..d9ad702a4a3b9 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -37,6 +37,13 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { assert(syms.length == 2) assert(syms(0)._1 == "b") assert(syms(1)._1 == "c") + + val word2VecMap = model.getVectors + val newModel = new Word2VecModel(word2VecMap) + val newSyms = newModel.findSynonyms("a", 2) + assert(newSyms.length == 2) + assert(newSyms(0)._1 == "b") + assert(newSyms(1)._1 == "c") } test("Word2VecModel") { From b1d61c4e441d423782805dcadb017d723d812b79 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 2 Jun 2015 21:58:24 +0530 Subject: [PATCH 2/5] better errors and tests --- 
.../apache/spark/mllib/feature/Word2Vec.scala | 20 +++++++++---------- .../spark/mllib/feature/Word2VecSuite.scala | 7 +++---- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index fce2021ad1aed..53a1381f840c7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -403,12 +403,7 @@ class Word2Vec extends Serializable with Logging { } newSentences.unpersist() - val wordArray = new Array[String](vocabSize) - var i = 0 - while (i < vocabSize) { - wordArray(i) = bcVocab.value(i).word - i += 1 - } + val wordArray = vocab.map(_.word) new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global) } @@ -434,16 +429,18 @@ class Word2Vec extends Serializable with Logging { */ @Experimental class Word2VecModel private[mllib] ( - wordIndex: Map[String, Int], - wordVectors: Array[Float]) extends Serializable with Saveable { + private val wordIndex: Map[String, Int], + private val wordVectors: Array[Float]) extends Serializable with Saveable { private val numWords = wordIndex.size // vectorSize: Dimension of each word's vector. private val vectorSize = wordVectors.length / numWords // wordList: Ordered list of words obtained from wordIndex. - private val wordList: Array[String] = wordIndex.keys.toArray - + private val wordList: Array[String] = { + val (wl, _) = wordIndex.toSeq.sortBy(_._2).unzip + wl.toArray + } // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. 
private val wordVecNorms: Array[Double] = { @@ -457,7 +454,7 @@ class Word2VecModel private[mllib] ( wordVecNorms } - private[mllib] def this(model: Map[String, Array[Float]]) = { + def this(model: Map[String, Array[Float]]) = { this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model)) } @@ -552,6 +549,7 @@ object Word2VecModel extends Loader[Word2VecModel] { } private def buildWordVectors(model: Map[String, Array[Float]]) = { + require(!model.isEmpty, "Word2VecMap should be non-empty") val (vectorSize, numWords) = (model.head._2.size, model.size) val wordList = model.keys.toArray val wordVectors = new Array[Float](vectorSize * numWords) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index d9ad702a4a3b9..6da61739b54c6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -38,12 +38,11 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { assert(syms(0)._1 == "b") assert(syms(1)._1 == "c") + // Test that model built using Word2Vec, i.e. wordVectors and wordIndex + // and a Word2VecMap give the same values. 
val word2VecMap = model.getVectors val newModel = new Word2VecModel(word2VecMap) - val newSyms = newModel.findSynonyms("a", 2) - assert(newSyms.length == 2) - assert(newSyms(0)._1 == "b") - assert(newSyms(1)._1 == "c") + assert(newModel.getVectors.mapValues(_.toSeq) == word2VecMap.mapValues(_.toSeq)) } test("Word2VecModel") { From fa043131902fd5633a2ecaf5651b3414bd728669 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Sat, 20 Jun 2015 00:30:29 +0530 Subject: [PATCH 3/5] style fixes --- .../apache/spark/mllib/feature/Word2Vec.scala | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 53a1381f840c7..4450631169090 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -420,18 +420,18 @@ class Word2Vec extends Serializable with Logging { /** * :: Experimental :: * Word2Vec model - * - * @param wordIndex: Maps each word to an index, which can retrieve the corresponding - * vector from wordVectors (see below). - * @param wordVectors: Array of length numWords * vectorSize, vector corresponding - * to the word mapped with index i can be retrieved by the slice - * (i * vectorSize, i * vectorSize + vectorSize) */ @Experimental class Word2VecModel private[mllib] ( private val wordIndex: Map[String, Int], private val wordVectors: Array[Float]) extends Serializable with Saveable { + // wordIndex: Maps each word to an index, which can retrieve the corresponding + // vector from wordVectors (see below). + // wordVectors: Array of length numWords * vectorSize, vector corresponding + // to the word mapped with index i can be retrieved by the slice + // (i * vectorSize, i * vectorSize + vectorSize) + private val numWords = wordIndex.size // vectorSize: Dimension of each word's vector. 
private val vectorSize = wordVectors.length / numWords @@ -441,6 +441,7 @@ class Word2VecModel private[mllib] ( val (wl, _) = wordIndex.toSeq.sortBy(_._2).unzip wl.toArray } + // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. private val wordVecNorms: Array[Double] = { @@ -544,11 +545,11 @@ class Word2VecModel private[mllib] ( @Experimental object Word2VecModel extends Loader[Word2VecModel] { - private def buildWordIndex(model: Map[String, Array[Float]]) = { + private def buildWordIndex(model: Map[String, Array[Float]]): Map[String, Int] = { model.keys.zipWithIndex.toMap } - private def buildWordVectors(model: Map[String, Array[Float]]) = { + private def buildWordVectors(model: Map[String, Array[Float]]): Array[Float] = { require(!model.isEmpty, "Word2VecMap should be non-empty") val (vectorSize, numWords) = (model.head._2.size, model.size) val wordList = model.keys.toArray From 5703116acea0f3e885061e191cb1956b7d4b2ca7 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 24 Jul 2015 22:11:57 +0530 Subject: [PATCH 4/5] minor --- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 5 ++--- .../scala/org/apache/spark/mllib/feature/Word2VecSuite.scala | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 4450631169090..91f863b3c39ef 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -550,14 +550,13 @@ object Word2VecModel extends Loader[Word2VecModel] { } private def buildWordVectors(model: Map[String, Array[Float]]): Array[Float] = { - require(!model.isEmpty, "Word2VecMap should be non-empty") + require(model.nonEmpty, "Word2VecMap should be non-empty") val (vectorSize, numWords) = (model.head._2.size, model.size) val wordList = model.keys.toArray val 
wordVectors = new Array[Float](vectorSize * numWords) var i = 0 while (i < numWords) { - val vec = model.get(wordList(i)).get - Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize) + Array.copy(model(wordList(i)), 0, wordVectors, i * vectorSize, vectorSize) i += 1 } wordVectors diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index 6da61739b54c6..4cc8d1129b858 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -42,7 +42,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { // and a Word2VecMap give the same values. val word2VecMap = model.getVectors val newModel = new Word2VecModel(word2VecMap) - assert(newModel.getVectors.mapValues(_.toSeq) == word2VecMap.mapValues(_.toSeq)) + assert(newModel.getVectors.mapValues(_.toSeq) === word2VecMap.mapValues(_.toSeq)) } test("Word2VecModel") { From e308913423c4c6019b21bcb05630268bc381fa1a Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 24 Jul 2015 23:29:46 +0530 Subject: [PATCH 5/5] move docs --- .../org/apache/spark/mllib/feature/Word2Vec.scala | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 91f863b3c39ef..cbbd2b0c8d060 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -420,18 +420,17 @@ class Word2Vec extends Serializable with Logging { /** * :: Experimental :: * Word2Vec model + * @param wordIndex maps each word to an index, which can retrieve the corresponding + * vector from wordVectors + * @param wordVectors array of length numWords * vectorSize, vector corresponding + * to the word mapped 
with index i can be retrieved by the slice + * (i * vectorSize, i * vectorSize + vectorSize) */ @Experimental class Word2VecModel private[mllib] ( private val wordIndex: Map[String, Int], private val wordVectors: Array[Float]) extends Serializable with Saveable { - // wordIndex: Maps each word to an index, which can retrieve the corresponding - // vector from wordVectors (see below). - // wordVectors: Array of length numWords * vectorSize, vector corresponding - // to the word mapped with index i can be retrieved by the slice - // (i * vectorSize, i * vectorSize + vectorSize) - private val numWords = wordIndex.size // vectorSize: Dimension of each word's vector. private val vectorSize = wordVectors.length / numWords