From 3b32c8c64a1cf560857533544f58491c5c7d204d Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 28 Apr 2015 23:53:15 +0530 Subject: [PATCH 1/5] [SPARK-7045] Avoid intermediate representation when creating model --- .../apache/spark/mllib/feature/Word2Vec.scala | 80 +++++++++++-------- .../spark/mllib/feature/Word2VecSuite.scala | 7 ++ 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index f087d06d2a46a..fce2021ad1aed 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -403,17 +403,13 @@ class Word2Vec extends Serializable with Logging { } newSentences.unpersist() - val word2VecMap = mutable.HashMap.empty[String, Array[Float]] + val wordArray = new Array[String](vocabSize) var i = 0 while (i < vocabSize) { - val word = bcVocab.value(i).word - val vector = new Array[Float](vectorSize) - Array.copy(syn0Global, i * vectorSize, vector, 0, vectorSize) - word2VecMap += word -> vector + wordArray(i) = bcVocab.value(i).word i += 1 } - - new Word2VecModel(word2VecMap.toMap) + new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global) } /** @@ -429,38 +425,40 @@ class Word2Vec extends Serializable with Logging { /** * :: Experimental :: * Word2Vec model + * + * @param wordIndex: Maps each word to an index, which can retrieve the corresponding + * vector from wordVectors (see below). + * @param wordVectors: Array of length numWords * vectorSize, vector corresponding + * to the word mapped with index i can be retrieved by the slice + * (i * vectorSize, i * vectorSize + vectorSize) */ @Experimental -class Word2VecModel private[spark] ( - model: Map[String, Array[Float]]) extends Serializable with Saveable { - - // wordList: Ordered list of words obtained from model. 
- private val wordList: Array[String] = model.keys.toArray +class Word2VecModel private[mllib] ( + wordIndex: Map[String, Int], + wordVectors: Array[Float]) extends Serializable with Saveable { - // wordIndex: Maps each word to an index, which can retrieve the corresponding - // vector from wordVectors (see below). - private val wordIndex: Map[String, Int] = wordList.zip(0 until model.size).toMap - - // vectorSize: Dimension of each word's vector. - private val vectorSize = model.head._2.size private val numWords = wordIndex.size + // vectorSize: Dimension of each word's vector. + private val vectorSize = wordVectors.length / numWords + + // wordList: Ordered list of words obtained from wordIndex. + private val wordList: Array[String] = wordIndex.keys.toArray - // wordVectors: Array of length numWords * vectorSize, vector corresponding to the word - // mapped with index i can be retrieved by the slice - // (ind * vectorSize, ind * vectorSize + vectorSize) // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. 
- private val (wordVectors: Array[Float], wordVecNorms: Array[Double]) = { - val wordVectors = new Array[Float](vectorSize * numWords) + private val wordVecNorms: Array[Double] = { val wordVecNorms = new Array[Double](numWords) var i = 0 while (i < numWords) { - val vec = model.get(wordList(i)).get - Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize) + val vec = wordVectors.slice(i * vectorSize, i * vectorSize + vectorSize) wordVecNorms(i) = blas.snrm2(vectorSize, vec, 1) i += 1 } - (wordVectors, wordVecNorms) + wordVecNorms + } + + private[mllib] def this(model: Map[String, Array[Float]]) = { + this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model)) } private def cosineSimilarity(v1: Array[Float], v2: Array[Float]): Double = { @@ -484,8 +482,9 @@ class Word2VecModel private[spark] ( * @return vector representation of word */ def transform(word: String): Vector = { - model.get(word) match { - case Some(vec) => + wordIndex.get(word) match { + case Some(ind) => + val vec = wordVectors.slice(ind * vectorSize, ind * vectorSize + vectorSize) Vectors.dense(vec.map(_.toDouble)) case None => throw new IllegalStateException(s"$word not in vocabulary") @@ -511,7 +510,7 @@ class Word2VecModel private[spark] ( */ def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = { require(num > 0, "Number of similar words should > 0") - + // TODO: optimize top-k val fVector = vector.toArray.map(_.toFloat) val cosineVec = Array.fill[Float](numWords)(0) val alpha: Float = 1 @@ -521,13 +520,13 @@ class Word2VecModel private[spark] ( "T", vectorSize, numWords, alpha, wordVectors, vectorSize, fVector, 1, beta, cosineVec, 1) // Need not divide with the norm of the given vector since it is constant. 
- val updatedCosines = new Array[Double](numWords) + val cosVec = cosineVec.map(_.toDouble) var ind = 0 while (ind < numWords) { - updatedCosines(ind) = cosineVec(ind) / wordVecNorms(ind) + cosVec(ind) /= wordVecNorms(ind) ind += 1 } - wordList.zip(updatedCosines) + wordList.zip(cosVec) .toSeq .sortBy(- _._2) .take(num + 1) @@ -548,6 +547,23 @@ class Word2VecModel private[spark] ( @Experimental object Word2VecModel extends Loader[Word2VecModel] { + private def buildWordIndex(model: Map[String, Array[Float]]) = { + model.keys.zipWithIndex.toMap + } + + private def buildWordVectors(model: Map[String, Array[Float]]) = { + val (vectorSize, numWords) = (model.head._2.size, model.size) + val wordList = model.keys.toArray + val wordVectors = new Array[Float](vectorSize * numWords) + var i = 0 + while (i < numWords) { + val vec = model.get(wordList(i)).get + Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize) + i += 1 + } + wordVectors + } + private object SaveLoadV1_0 { val formatVersionV1_0 = "1.0" diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index b6818369208d7..d9ad702a4a3b9 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -37,6 +37,13 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { assert(syms.length == 2) assert(syms(0)._1 == "b") assert(syms(1)._1 == "c") + + val word2VecMap = model.getVectors + val newModel = new Word2VecModel(word2VecMap) + val newSyms = newModel.findSynonyms("a", 2) + assert(newSyms.length == 2) + assert(newSyms(0)._1 == "b") + assert(newSyms(1)._1 == "c") } test("Word2VecModel") { From b1d61c4e441d423782805dcadb017d723d812b79 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 2 Jun 2015 21:58:24 +0530 Subject: [PATCH 2/5] better errors and tests --- 
.../apache/spark/mllib/feature/Word2Vec.scala | 20 +++++++++---------- .../spark/mllib/feature/Word2VecSuite.scala | 7 +++---- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index fce2021ad1aed..53a1381f840c7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -403,12 +403,7 @@ class Word2Vec extends Serializable with Logging { } newSentences.unpersist() - val wordArray = new Array[String](vocabSize) - var i = 0 - while (i < vocabSize) { - wordArray(i) = bcVocab.value(i).word - i += 1 - } + val wordArray = vocab.map(_.word) new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global) } @@ -434,16 +429,18 @@ class Word2Vec extends Serializable with Logging { */ @Experimental class Word2VecModel private[mllib] ( - wordIndex: Map[String, Int], - wordVectors: Array[Float]) extends Serializable with Saveable { + private val wordIndex: Map[String, Int], + private val wordVectors: Array[Float]) extends Serializable with Saveable { private val numWords = wordIndex.size // vectorSize: Dimension of each word's vector. private val vectorSize = wordVectors.length / numWords // wordList: Ordered list of words obtained from wordIndex. - private val wordList: Array[String] = wordIndex.keys.toArray - + private val wordList: Array[String] = { + val (wl, _) = wordIndex.toSeq.sortBy(_._2).unzip + wl.toArray + } // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. 
private val wordVecNorms: Array[Double] = { @@ -457,7 +454,7 @@ class Word2VecModel private[mllib] ( wordVecNorms } - private[mllib] def this(model: Map[String, Array[Float]]) = { + def this(model: Map[String, Array[Float]]) = { this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model)) } @@ -552,6 +549,7 @@ object Word2VecModel extends Loader[Word2VecModel] { } private def buildWordVectors(model: Map[String, Array[Float]]) = { + require(!model.isEmpty, "Word2VecMap should be non-empty") val (vectorSize, numWords) = (model.head._2.size, model.size) val wordList = model.keys.toArray val wordVectors = new Array[Float](vectorSize * numWords) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index d9ad702a4a3b9..6da61739b54c6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -38,12 +38,11 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { assert(syms(0)._1 == "b") assert(syms(1)._1 == "c") + // Test that model built using Word2Vec, i.e. wordVectors and wordIndex + // and a Word2VecMap give the same values. 
val word2VecMap = model.getVectors val newModel = new Word2VecModel(word2VecMap) - val newSyms = newModel.findSynonyms("a", 2) - assert(newSyms.length == 2) - assert(newSyms(0)._1 == "b") - assert(newSyms(1)._1 == "c") + assert(newModel.getVectors.mapValues(_.toSeq) == word2VecMap.mapValues(_.toSeq)) } test("Word2VecModel") { From fa043131902fd5633a2ecaf5651b3414bd728669 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Sat, 20 Jun 2015 00:30:29 +0530 Subject: [PATCH 3/5] style fixes --- .../apache/spark/mllib/feature/Word2Vec.scala | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 53a1381f840c7..4450631169090 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -420,18 +420,18 @@ class Word2Vec extends Serializable with Logging { /** * :: Experimental :: * Word2Vec model - * - * @param wordIndex: Maps each word to an index, which can retrieve the corresponding - * vector from wordVectors (see below). - * @param wordVectors: Array of length numWords * vectorSize, vector corresponding - * to the word mapped with index i can be retrieved by the slice - * (i * vectorSize, i * vectorSize + vectorSize) */ @Experimental class Word2VecModel private[mllib] ( private val wordIndex: Map[String, Int], private val wordVectors: Array[Float]) extends Serializable with Saveable { + // wordIndex: Maps each word to an index, which can retrieve the corresponding + // vector from wordVectors (see below). + // wordVectors: Array of length numWords * vectorSize, vector corresponding + // to the word mapped with index i can be retrieved by the slice + // (i * vectorSize, i * vectorSize + vectorSize) + private val numWords = wordIndex.size // vectorSize: Dimension of each word's vector. 
private val vectorSize = wordVectors.length / numWords @@ -441,6 +441,7 @@ class Word2VecModel private[mllib] ( val (wl, _) = wordIndex.toSeq.sortBy(_._2).unzip wl.toArray } + // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. private val wordVecNorms: Array[Double] = { @@ -544,11 +545,11 @@ class Word2VecModel private[mllib] ( @Experimental object Word2VecModel extends Loader[Word2VecModel] { - private def buildWordIndex(model: Map[String, Array[Float]]) = { + private def buildWordIndex(model: Map[String, Array[Float]]): Map[String, Int] = { model.keys.zipWithIndex.toMap } - private def buildWordVectors(model: Map[String, Array[Float]]) = { + private def buildWordVectors(model: Map[String, Array[Float]]): Array[Float] = { require(!model.isEmpty, "Word2VecMap should be non-empty") val (vectorSize, numWords) = (model.head._2.size, model.size) val wordList = model.keys.toArray From 5703116acea0f3e885061e191cb1956b7d4b2ca7 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 24 Jul 2015 22:11:57 +0530 Subject: [PATCH 4/5] minor --- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 5 ++--- .../scala/org/apache/spark/mllib/feature/Word2VecSuite.scala | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 4450631169090..91f863b3c39ef 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -550,14 +550,13 @@ object Word2VecModel extends Loader[Word2VecModel] { } private def buildWordVectors(model: Map[String, Array[Float]]): Array[Float] = { - require(!model.isEmpty, "Word2VecMap should be non-empty") + require(model.nonEmpty, "Word2VecMap should be non-empty") val (vectorSize, numWords) = (model.head._2.size, model.size) val wordList = model.keys.toArray val 
wordVectors = new Array[Float](vectorSize * numWords) var i = 0 while (i < numWords) { - val vec = model.get(wordList(i)).get - Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize) + Array.copy(model(wordList(i)), 0, wordVectors, i * vectorSize, vectorSize) i += 1 } wordVectors diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index 6da61739b54c6..4cc8d1129b858 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -42,7 +42,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { // and a Word2VecMap give the same values. val word2VecMap = model.getVectors val newModel = new Word2VecModel(word2VecMap) - assert(newModel.getVectors.mapValues(_.toSeq) == word2VecMap.mapValues(_.toSeq)) + assert(newModel.getVectors.mapValues(_.toSeq) === word2VecMap.mapValues(_.toSeq)) } test("Word2VecModel") { From e308913423c4c6019b21bcb05630268bc381fa1a Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 24 Jul 2015 23:29:46 +0530 Subject: [PATCH 5/5] move docs --- .../org/apache/spark/mllib/feature/Word2Vec.scala | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 91f863b3c39ef..cbbd2b0c8d060 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -420,18 +420,17 @@ class Word2Vec extends Serializable with Logging { /** * :: Experimental :: * Word2Vec model + * @param wordIndex maps each word to an index, which can retrieve the corresponding + * vector from wordVectors + * @param wordVectors array of length numWords * vectorSize, vector corresponding + * to the word mapped 
with index i can be retrieved by the slice + * (i * vectorSize, i * vectorSize + vectorSize) */ @Experimental class Word2VecModel private[mllib] ( private val wordIndex: Map[String, Int], private val wordVectors: Array[Float]) extends Serializable with Saveable { - // wordIndex: Maps each word to an index, which can retrieve the corresponding - // vector from wordVectors (see below). - // wordVectors: Array of length numWords * vectorSize, vector corresponding - // to the word mapped with index i can be retrieved by the slice - // (i * vectorSize, i * vectorSize + vectorSize) - private val numWords = wordIndex.size // vectorSize: Dimension of each word's vector. private val vectorSize = wordVectors.length / numWords