@@ -117,15 +117,22 @@ Word2Vec is implemented in [Word2Vec](api/scala/index.html#org.apache.spark.ml.f
117117{% highlight scala %}
118118import org.apache.spark.ml.feature.Word2Vec
119119
120+ // Input data: Each row is a bag of words from a sentence or document.
120121val documentDF = sqlContext.createDataFrame(Seq(
121122 "Hi I heard about Spark".split(" "),
122123 "I wish Java could use case classes".split(" "),
123124 "Logistic regression models are neat".split(" ")
124- )) .map(Tuple1.apply).toDF("text")
125+ )).map(Tuple1.apply).toDF("text")
125126
126- val word2Vec = new Word2Vec.setInputCol("text").setOutputCol("result").setVectorSize(3)
127+ // Learn a mapping from words to Vectors.
128+ val word2Vec = new Word2Vec()
129+ .setInputCol("text")
130+ .setOutputCol("result")
131+ .setVectorSize(3)
132+ .setMinCount(0)
127133val model = word2Vec.fit(documentDF)
128- val result = model.transform(documentDF).select("result").take(3).foreach(println)
134+ val result = model.transform(documentDF)
135+ result.select("result").take(3).foreach(println)
129136{% endhighlight %}
130137</div>
131138
@@ -143,24 +150,26 @@ import org.apache.spark.sql.types.*;
143150
144151JavaSparkContext jsc = ...
145152SQLContext sqlContext = ...
153+
154+ // Input data: Each row is a bag of words from a sentence or document.
146155JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
147156 RowFactory.create(Lists.newArrayList("Hi I heard about Spark".split(" "))),
148157 RowFactory.create(Lists.newArrayList("I wish Java could use case classes".split(" "))),
149158 RowFactory.create(Lists.newArrayList("Logistic regression models are neat".split(" ")))
150159));
151160StructType schema = new StructType(new StructField[] {
152- new StructField("text", new ArrayType(StringType$.MODULE$ , true), false, Metadata.empty())
161+ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
153162});
154163DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
155164
165+ // Learn a mapping from words to Vectors.
156166Word2Vec word2Vec = new Word2Vec()
157167 .setInputCol("text")
158168 .setOutputCol("result")
159169 .setVectorSize(3)
160170 .setMinCount(0);
161171Word2VecModel model = word2Vec.fit(documentDF);
162172DataFrame result = model.transform(documentDF);
163-
164173for (Row r: result.select("result").take(3)) {
165174 System.out.println(r);
166175}
@@ -171,12 +180,14 @@ for (Row r: result.select("result").take(3)) {
171180{% highlight python %}
172181from pyspark.ml.feature import Word2Vec
173182
183+ # Input data: Each row is a bag of words from a sentence or document.
174184documentDF = sqlContext.createDataFrame([
175185 ("Hi I heard about Spark".split(" "), ),
176186 ("I wish Java could use case classes".split(" "), ),
177187 ("Logistic regression models are neat".split(" "), )
178188], ["text"])
179- word2Vec = Word2Vec(vectorSize = 3, minCount = 0, inputCol = "text", outputCol = "result")
189+ # Learn a mapping from words to Vectors.
190+ word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
180191model = word2Vec.fit(documentDF)
181192result = model.transform(documentDF)
182193for feature in result.select("result").take(3):
0 commit comments