Skip to content

Commit 7e60b76

Browse files
committed
Changes of the examples after API improvements
1 parent a78d920 commit 7e60b76

11 files changed

+75
-82
lines changed

docs/ml-features.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1514,15 +1514,15 @@ Bucketed Random Projection accepts arbitrary vectors as input features, and supp
15141514
Refer to the [RandomProjection Scala docs](api/scala/index.html#org.apache.spark.ml.feature.RandomProjection)
15151515
for more details on the API.
15161516

1517-
{% include_example scala/org/apache/spark/examples/ml/RandomProjectionExample.scala %}
1517+
{% include_example scala/org/apache/spark/examples/ml/BucketedRandomProjectionLSHExample.scala %}
15181518
</div>
15191519

15201520
<div data-lang="java" markdown="1">
15211521

15221522
Refer to the [RandomProjection Java docs](api/java/org/apache/spark/ml/feature/RandomProjection.html)
15231523
for more details on the API.
15241524

1525-
{% include_example java/org/apache/spark/examples/ml/JavaRandomProjectionExample.java %}
1525+
{% include_example java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java %}
15261526
</div>
15271527
</div>
15281528

@@ -1546,15 +1546,15 @@ The input sets for MinHash are represented as binary vectors, where the vector i
15461546
Refer to the [MinHash Scala docs](api/scala/index.html#org.apache.spark.ml.feature.MinHash)
15471547
for more details on the API.
15481548

1549-
{% include_example scala/org/apache/spark/examples/ml/MinHashExample.scala %}
1549+
{% include_example scala/org/apache/spark/examples/ml/MinHashLSHExample.scala %}
15501550
</div>
15511551

15521552
<div data-lang="java" markdown="1">
15531553

15541554
Refer to the [MinHash Java docs](api/java/org/apache/spark/ml/feature/MinHash.html)
15551555
for more details on the API.
15561556

1557-
{% include_example java/org/apache/spark/examples/ml/JavaMinHashExample.java %}
1557+
{% include_example java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java %}
15581558
</div>
15591559
</div>
15601560

examples/src/main/java/org/apache/spark/examples/ml/JavaApproxNearestNeighborExample.java

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
import java.util.Arrays;
2424
import java.util.List;
2525

26-
import org.apache.spark.ml.feature.MinHash;
27-
import org.apache.spark.ml.feature.MinHashModel;
26+
import org.apache.spark.ml.feature.MinHashLSH;
27+
import org.apache.spark.ml.feature.MinHashLSHModel;
2828
import org.apache.spark.ml.linalg.Vector;
2929
import org.apache.spark.ml.linalg.VectorUDT;
3030
import org.apache.spark.ml.linalg.Vectors;
@@ -57,27 +57,15 @@ public static void main(String[] args) {
5757
});
5858
Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
5959

60-
MinHash mh = new MinHash()
61-
.setOutputDim(5)
60+
MinHashLSH mh = new MinHashLSH()
61+
.setNumHashTables(5)
6262
.setInputCol("keys")
6363
.setOutputCol("values");
6464

6565
Vector key1 = Vectors.sparse(6, new int[]{1, 3}, new double[]{1.0, 1.0});
66-
Vector key2 = Vectors.sparse(6, new int[]{5}, new double[]{1.0, 1.0, 1.0});
6766

68-
MinHashModel model = mh.fit(dataFrame);
67+
MinHashLSHModel model = mh.fit(dataFrame);
6968
model.approxNearestNeighbors(dataFrame, key1, 2).show();
70-
71-
System.out.println("Difference between single probing and multi probing:");
72-
73-
System.out.println("Single probing sometimes returns less than k rows");
74-
model.approxNearestNeighbors(dataFrame, key2, 3, true, "distCol").show();
75-
76-
System.out.println("Multi probing returns exact k rows whenever possible");
77-
model.approxNearestNeighbors(dataFrame, key2, 3, false, "distCol").show();
78-
79-
System.out.println("Multi probing returns the whole dataset when there are not enough rows");
80-
model.approxNearestNeighbors(dataFrame, key2, 4, false, "distCol").show();
8169
// $example off$
8270

8371
spark.stop();

examples/src/main/java/org/apache/spark/examples/ml/JavaApproxSimilarityJoinExample.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
import java.util.Arrays;
2424
import java.util.List;
2525

26-
import org.apache.spark.ml.feature.MinHash;
27-
import org.apache.spark.ml.feature.MinHashModel;
26+
import org.apache.spark.ml.feature.MinHashLSH;
27+
import org.apache.spark.ml.feature.MinHashLSHModel;
2828
import org.apache.spark.ml.linalg.VectorUDT;
2929
import org.apache.spark.ml.linalg.Vectors;
3030
import org.apache.spark.sql.Dataset;
@@ -63,12 +63,12 @@ public static void main(String[] args) {
6363
Dataset<Row> dfA = spark.createDataFrame(dataA, schema);
6464
Dataset<Row> dfB = spark.createDataFrame(dataB, schema);
6565

66-
MinHash mh = new MinHash()
67-
.setOutputDim(5)
66+
MinHashLSH mh = new MinHashLSH()
67+
.setNumHashTables(5)
6868
.setInputCol("keys")
6969
.setOutputCol("values");
7070

71-
MinHashModel model = mh.fit(dfA);
71+
MinHashLSHModel model = mh.fit(dfA);
7272
model.approxSimilarityJoin(dfA, dfB, 0.6).show();
7373

7474
// Cache the transformed columns

examples/src/main/java/org/apache/spark/examples/ml/JavaRandomProjectionExample.java renamed to examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
import java.util.Arrays;
2424
import java.util.List;
2525

26-
import org.apache.spark.ml.feature.RandomProjection;
27-
import org.apache.spark.ml.feature.RandomProjectionModel;
26+
import org.apache.spark.ml.feature.BucketedRandomProjectionLSH;
27+
import org.apache.spark.ml.feature.BucketedRandomProjectionLSHModel;
2828
import org.apache.spark.ml.linalg.VectorUDT;
2929
import org.apache.spark.ml.linalg.Vectors;
3030
import org.apache.spark.sql.Dataset;
@@ -36,11 +36,11 @@
3636
import org.apache.spark.sql.types.StructType;
3737
// $example off$
3838

39-
public class JavaRandomProjectionExample {
39+
public class JavaBucketedRandomProjectionLSHExample {
4040
public static void main(String[] args) {
4141
SparkSession spark = SparkSession
4242
.builder()
43-
.appName("JavaRandomProjectionExample")
43+
.appName("JavaBucketedRandomProjectionLSHExample")
4444
.getOrCreate();
4545

4646
// $example on$
@@ -57,13 +57,13 @@ public static void main(String[] args) {
5757
});
5858
Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
5959

60-
RandomProjection mh = new RandomProjection()
60+
BucketedRandomProjectionLSH mh = new BucketedRandomProjectionLSH()
6161
.setBucketLength(2.0)
62-
.setOutputDim(1)
62+
.setNumHashTables(1)
6363
.setInputCol("keys")
6464
.setOutputCol("values");
6565

66-
RandomProjectionModel model = mh.fit(dataFrame);
66+
BucketedRandomProjectionLSHModel model = mh.fit(dataFrame);
6767
model.transform(dataFrame).show();
6868
// $example off$
6969

examples/src/main/java/org/apache/spark/examples/ml/JavaLSHTransformationExample.java

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
import java.util.Arrays;
2424
import java.util.List;
2525

26-
import org.apache.spark.ml.feature.MinHash;
27-
import org.apache.spark.ml.feature.MinHashModel;
26+
import org.apache.spark.ml.feature.MinHashLSH;
27+
import org.apache.spark.ml.feature.MinHashLSHModel;
2828
import org.apache.spark.ml.linalg.VectorUDT;
2929
import org.apache.spark.ml.linalg.Vectors;
3030
import org.apache.spark.sql.Dataset;
@@ -57,20 +57,19 @@ public static void main(String[] args) {
5757
Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
5858

5959
// Single LSH hashing
60-
MinHash mhSingleHash = new MinHash()
61-
.setOutputDim(1)
60+
MinHashLSH mhSingleHash = new MinHashLSH()
6261
.setInputCol("keys")
6362
.setOutputCol("values");
64-
MinHashModel modelSingleHash = mhSingleHash.fit(dataFrame);
63+
MinHashLSHModel modelSingleHash = mhSingleHash.fit(dataFrame);
6564
// Feature transformation: add a new column for a hash value
6665
modelSingleHash.transform(dataFrame).show();
6766

6867
// Use more than 1 hash functions
69-
MinHash mh = new MinHash()
70-
.setOutputDim(5)
68+
MinHashLSH mh = new MinHashLSH()
69+
.setNumHashTables(5)
7170
.setInputCol("keys")
7271
.setOutputCol("values");
73-
MinHashModel model = mh.fit(dataFrame);
72+
MinHashLSHModel model = mh.fit(dataFrame);
7473
// Feature Transformation: add a new column for multiple hash values
7574
model.transform(dataFrame).show();
7675
// $example off$

examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashExample.java renamed to examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,30 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.examples.ml;
19+
120
import org.apache.spark.sql.SparkSession;
221

322
// $example on$
423
import java.util.Arrays;
524
import java.util.List;
625

7-
import org.apache.spark.ml.feature.MinHash;
8-
import org.apache.spark.ml.feature.MinHashModel;
26+
import org.apache.spark.ml.feature.MinHashLSH;
27+
import org.apache.spark.ml.feature.MinHashLSHModel;
928
import org.apache.spark.ml.linalg.VectorUDT;
1029
import org.apache.spark.ml.linalg.Vectors;
1130
import org.apache.spark.sql.Dataset;
@@ -17,11 +36,11 @@
1736
import org.apache.spark.sql.types.StructType;
1837
// $example off$
1938

20-
public class JavaMinHashExample {
39+
public class JavaMinHashLSHExample {
2140
public static void main(String[] args) {
2241
SparkSession spark = SparkSession
2342
.builder()
24-
.appName("JavaMinHashExample")
43+
.appName("JavaMinHashLSHExample")
2544
.getOrCreate();
2645

2746
// $example on$
@@ -37,12 +56,12 @@ public static void main(String[] args) {
3756
});
3857
Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
3958

40-
MinHash mh = new MinHash()
41-
.setOutputDim(1)
59+
MinHashLSH mh = new MinHashLSH()
60+
.setNumHashTables(1)
4261
.setInputCol("keys")
4362
.setOutputCol("values");
4463

45-
MinHashModel model = mh.fit(dataFrame);
64+
MinHashLSHModel model = mh.fit(dataFrame);
4665
model.transform(dataFrame).show();
4766
// $example off$
4867

examples/src/main/scala/org/apache/spark/examples/ml/ApproxNearestNeighborExample.scala

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
package org.apache.spark.examples.ml
2020

2121
// $example on$
22-
import org.apache.spark.ml.feature.MinHash
22+
import org.apache.spark.ml.feature.MinHashLSH
2323
import org.apache.spark.ml.linalg.Vectors
2424
// $example off$
2525
import org.apache.spark.sql.SparkSession
@@ -39,27 +39,15 @@ object ApproxNearestNeighborExample {
3939
(2, Vectors.sparse(6, Seq((0, 1.0), (2, 1.0), (4, 1.0))))
4040
)).toDF("id", "keys")
4141

42-
val mh = new MinHash()
43-
.setOutputDim(5)
42+
val mh = new MinHashLSH()
43+
.setNumHashTables(5)
4444
.setInputCol("keys")
4545
.setOutputCol("values")
4646

4747
val key1 = Vectors.sparse(6, Seq((1, 1.0), (3, 1.0)))
48-
val key2 = Vectors.sparse(6, Seq((5, 1.0)))
4948

5049
val model = mh.fit(dataFrame)
5150
model.approxNearestNeighbors(dataFrame, key1, 2).show()
52-
53-
println("Difference between single probing and multi probing:")
54-
55-
println("single probing sometimes returns less than k rows")
56-
model.approxNearestNeighbors(dataFrame, key2, 3, true, "distCol").show()
57-
58-
println("multi probing returns exact k rows whenever possible")
59-
model.approxNearestNeighbors(dataFrame, key2, 3, false, "distCol").show()
60-
61-
println("multi probing returns the whole dataset when there are not enough rows")
62-
model.approxNearestNeighbors(dataFrame, key2, 4, false, "distCol").show()
6351
// $example off$
6452

6553
spark.stop()

examples/src/main/scala/org/apache/spark/examples/ml/ApproxSimilarityJoinExample.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
package org.apache.spark.examples.ml
2020

2121
// $example on$
22-
import org.apache.spark.ml.feature.MinHash
22+
import org.apache.spark.ml.feature.MinHashLSH
2323
import org.apache.spark.ml.linalg.Vectors
2424
// $example off$
2525
import org.apache.spark.sql.SparkSession
@@ -45,8 +45,8 @@ object ApproxSimilarityJoinExample {
4545
(5, Vectors.sparse(6, Seq((1, 1.0), (2, 1.0), (4, 1.0))))
4646
)).toDF("id", "keys")
4747

48-
val mh = new MinHash()
49-
.setOutputDim(5)
48+
val mh = new MinHashLSH()
49+
.setNumHashTables(5)
5050
.setInputCol("keys")
5151
.setOutputCol("values")
5252

examples/src/main/scala/org/apache/spark/examples/ml/RandomProjectionExample.scala renamed to examples/src/main/scala/org/apache/spark/examples/ml/BucketedRandomProjectionLSHExample.scala

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,17 @@
1919
package org.apache.spark.examples.ml
2020

2121
// $example on$
22-
import org.apache.spark.ml.feature.RandomProjection
22+
import org.apache.spark.ml.feature.BucketedRandomProjectionLSH
2323
import org.apache.spark.ml.linalg.Vectors
2424
// $example off$
2525
import org.apache.spark.sql.SparkSession
2626

27-
object RandomProjectionExample {
27+
object BucketedRandomProjectionLSHExample {
2828
def main(args: Array[String]): Unit = {
2929
// Creates a SparkSession
3030
val spark = SparkSession
3131
.builder
32-
.appName("RandomProjectionExample")
32+
.appName("BucketedRandomProjectionLSHExample")
3333
.getOrCreate()
3434

3535
// $example on$
@@ -40,9 +40,9 @@ object RandomProjectionExample {
4040
(2, Vectors.dense(-1.0, 1.0))
4141
)).toDF("id", "keys")
4242

43-
val rp = new RandomProjection()
43+
val rp = new BucketedRandomProjectionLSH()
4444
.setBucketLength(2.0)
45-
.setOutputDim(1)
45+
.setNumHashTables(1)
4646
.setInputCol("keys")
4747
.setOutputCol("values")
4848

examples/src/main/scala/org/apache/spark/examples/ml/LSHTransformationExample.scala

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
package org.apache.spark.examples.ml
2020

2121
// $example on$
22-
import org.apache.spark.ml.feature.MinHash
22+
import org.apache.spark.ml.feature.MinHashLSH
2323
import org.apache.spark.ml.linalg.Vectors
2424
// $example off$
2525
import org.apache.spark.sql.SparkSession
@@ -40,17 +40,16 @@ object LSHTransformationExample {
4040
)).toDF("id", "keys")
4141

4242
// Single LSH hashing
43-
val mhSingleHash = new MinHash()
44-
.setOutputDim(1)
43+
val mhSingleHash = new MinHashLSH()
4544
.setInputCol("keys")
4645
.setOutputCol("values")
4746
val modelSingleHash = mhSingleHash.fit(dataFrame)
4847
// Feature transformation: add a new column for a hash value
4948
modelSingleHash.transform(dataFrame).show()
5049

51-
// Use more than 1 hash functions
52-
val mh = new MinHash()
53-
.setOutputDim(5)
50+
// Use more than one hash table
51+
val mh = new MinHashLSH()
52+
.setNumHashTables(5)
5453
.setInputCol("keys")
5554
.setOutputCol("values")
5655
val model = mh.fit(dataFrame)

0 commit comments

Comments
 (0)