
Commit c268ca4

y-shimizu authored and mengxr committed
[SPARK-10518] [DOCS] Update code examples in spark.ml user guide to use LIBSVM data source instead of MLUtils
Updated the example code in the spark.ml user guide to use the LIBSVM data source instead of MLUtils.

Author: y-shimizu <[email protected]>

Closes #8697 from y-shimizu/SPARK-10518.
1 parent 9bbe33f commit c268ca4
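
For context, the change swaps the RDD-based loader for the DataFrame reader API. A minimal before/after sketch in Scala (assuming a spark-shell session against this Spark 1.6-era codebase, where sc and sqlContext are predefined):

// Before: load an RDD[LabeledPoint] with MLUtils, then convert it to a DataFrame.
import org.apache.spark.mllib.util.MLUtils
val oldData = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

// After: the "libsvm" data source reads straight into a DataFrame with the
// "label" and "features" columns that spark.ml estimators expect by default.
val data = sqlContext.read.format("libsvm")
  .load("data/mllib/sample_libsvm_data.txt")
data.printSchema() // expect: label (double), features (vector)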


3 files changed (+47, -104 lines)


docs/ml-ensembles.md

Lines changed: 19 additions & 46 deletions
@@ -121,10 +121,9 @@ import org.apache.spark.ml.classification.RandomForestClassifier
 import org.apache.spark.ml.classification.RandomForestClassificationModel
 import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils
 
 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -193,14 +192,11 @@ import org.apache.spark.ml.classification.RandomForestClassifier;
 import org.apache.spark.ml.classification.RandomForestClassificationModel;
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
 import org.apache.spark.ml.feature.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 
 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read().format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 
 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -268,10 +264,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.classification import RandomForestClassifier
 from pyspark.ml.feature import StringIndexer, VectorIndexer
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils
 
 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 # Index labels, adding metadata to the label column.
 # Fit on whole dataset to include all labels in index.
@@ -327,10 +322,9 @@ import org.apache.spark.ml.regression.RandomForestRegressor
 import org.apache.spark.ml.regression.RandomForestRegressionModel
 import org.apache.spark.ml.feature.VectorIndexer
 import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.mllib.util.MLUtils
 
 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -387,14 +381,11 @@ import org.apache.spark.ml.feature.VectorIndexer;
 import org.apache.spark.ml.feature.VectorIndexerModel;
 import org.apache.spark.ml.regression.RandomForestRegressionModel;
 import org.apache.spark.ml.regression.RandomForestRegressor;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 
 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read().format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 
 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -450,10 +441,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.regression import RandomForestRegressor
 from pyspark.ml.feature import VectorIndexer
 from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.mllib.util import MLUtils
 
 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 # Automatically identify categorical features, and index them.
 # Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -576,10 +566,9 @@ import org.apache.spark.ml.classification.GBTClassifier
 import org.apache.spark.ml.classification.GBTClassificationModel
 import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils
 
 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -648,14 +637,10 @@ import org.apache.spark.ml.classification.GBTClassifier;
 import org.apache.spark.ml.classification.GBTClassificationModel;
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
 import org.apache.spark.ml.feature.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 
 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
 
 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -724,10 +709,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.classification import GBTClassifier
 from pyspark.ml.feature import StringIndexer, VectorIndexer
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils
 
 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 # Index labels, adding metadata to the label column.
 # Fit on whole dataset to include all labels in index.
@@ -783,10 +767,9 @@ import org.apache.spark.ml.regression.GBTRegressor
 import org.apache.spark.ml.regression.GBTRegressionModel
 import org.apache.spark.ml.feature.VectorIndexer
 import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.mllib.util.MLUtils
 
 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -844,14 +827,10 @@ import org.apache.spark.ml.feature.VectorIndexer;
 import org.apache.spark.ml.feature.VectorIndexerModel;
 import org.apache.spark.ml.regression.GBTRegressionModel;
 import org.apache.spark.ml.regression.GBTRegressor;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 
 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
 
 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -908,10 +887,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.regression import GBTRegressor
 from pyspark.ml.feature import VectorIndexer
 from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.mllib.util import MLUtils
 
 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 # Automatically identify categorical features, and index them.
 # Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -970,15 +948,14 @@ Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classifie
 {% highlight scala %}
 import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
 import org.apache.spark.mllib.evaluation.MulticlassMetrics
-import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.{Row, SQLContext}
 
 val sqlContext = new SQLContext(sc)
 
 // parse data into dataframe
-val data = MLUtils.loadLibSVMFile(sc,
-  "data/mllib/sample_multiclass_classification_data.txt")
-val Array(train, test) = data.toDF().randomSplit(Array(0.7, 0.3))
+val data = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_multiclass_classification_data.txt")
+val Array(train, test) = data.randomSplit(Array(0.7, 0.3))
 
 // instantiate multiclass learner and train
 val ovr = new OneVsRest().setClassifier(new LogisticRegression)
@@ -1016,20 +993,16 @@ import org.apache.spark.ml.classification.OneVsRest;
 import org.apache.spark.ml.classification.OneVsRestModel;
 import org.apache.spark.mllib.evaluation.MulticlassMetrics;
 import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.SQLContext;
 
 SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
 JavaSparkContext jsc = new JavaSparkContext(conf);
 SQLContext jsql = new SQLContext(jsc);
 
-RDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(),
-  "data/mllib/sample_multiclass_classification_data.txt");
+DataFrame dataFrame = jsql.read().format("libsvm")
+  .load("data/mllib/sample_multiclass_classification_data.txt");
 
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
 DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345);
 DataFrame train = splits[0];
 DataFrame test = splits[1];
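
To see the new loader in context, here is a compilable sketch of a minimal ensemble example in the spirit of those above; a hypothetical standalone program (the object name is illustrative), using the Spark 1.5/1.6-era spark.ml APIs:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.classification.RandomForestClassifier

object RandomForestLibSVMExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RandomForestLibSVMExample"))
    val sqlContext = new SQLContext(sc)

    // One line replaces the old MLUtils.loadLibSVMFile + toDF() round trip.
    val data = sqlContext.read.format("libsvm")
      .load("data/mllib/sample_libsvm_data.txt")
    val Array(train, test) = data.randomSplit(Array(0.7, 0.3))

    // The default label/features column names match what the source produces,
    // so the estimator needs no extra wiring.
    val model = new RandomForestClassifier().setNumTrees(10).fit(train)
    model.transform(test).select("label", "prediction").show(5)

    sc.stop()
  }
}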

docs/ml-features.md

Lines changed: 22 additions & 42 deletions
@@ -1179,9 +1179,9 @@ In the example below, we read in a dataset of labeled points and then use `Vecto
 <div data-lang="scala" markdown="1">
 {% highlight scala %}
 import org.apache.spark.ml.feature.VectorIndexer
-import org.apache.spark.mllib.util.MLUtils
 
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt")
 val indexer = new VectorIndexer()
   .setInputCol("features")
   .setOutputCol("indexed")
@@ -1200,16 +1200,12 @@ val indexedData = indexerModel.transform(data)
 {% highlight java %}
 import java.util.Map;
 
-import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.VectorIndexer;
 import org.apache.spark.ml.feature.VectorIndexerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;
 
-JavaRDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(),
-  "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read().format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 VectorIndexer indexer = new VectorIndexer()
   .setInputCol("features")
   .setOutputCol("indexed")
@@ -1230,9 +1226,9 @@ DataFrame indexedData = indexerModel.transform(data);
 <div data-lang="python" markdown="1">
 {% highlight python %}
 from pyspark.ml.feature import VectorIndexer
-from pyspark.mllib.util import MLUtils
 
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm") \
+    .load("data/mllib/sample_libsvm_data.txt")
 indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
 indexerModel = indexer.fit(data)
 
@@ -1253,10 +1249,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
 <div data-lang="scala">
 {% highlight scala %}
 import org.apache.spark.ml.feature.Normalizer
-import org.apache.spark.mllib.util.MLUtils
 
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt")
 
 // Normalize each Vector using $L^1$ norm.
 val normalizer = new Normalizer()
@@ -1272,15 +1267,11 @@ val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.Positi
 
 <div data-lang="java">
 {% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.Normalizer;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;
 
-JavaRDD<LabeledPoint> data =
-  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 
 // Normalize each Vector using $L^1$ norm.
 Normalizer normalizer = new Normalizer()
@@ -1297,11 +1288,10 @@ DataFrame lInfNormData =
 
 <div data-lang="python">
 {% highlight python %}
-from pyspark.mllib.util import MLUtils
 from pyspark.ml.feature import Normalizer
 
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-dataFrame = sqlContext.createDataFrame(data)
+dataFrame = sqlContext.read.format("libsvm") \
+    .load("data/mllib/sample_libsvm_data.txt")
 
 # Normalize each Vector using $L^1$ norm.
 normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
@@ -1335,10 +1325,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
 <div data-lang="scala">
 {% highlight scala %}
 import org.apache.spark.ml.feature.StandardScaler
-import org.apache.spark.mllib.util.MLUtils
 
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt")
 val scaler = new StandardScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures")
@@ -1355,16 +1344,12 @@ val scaledData = scalerModel.transform(dataFrame)
 
 <div data-lang="java">
 {% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.StandardScaler;
 import org.apache.spark.ml.feature.StandardScalerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;
 
-JavaRDD<LabeledPoint> data =
-  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 StandardScaler scaler = new StandardScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures")
@@ -1381,11 +1366,10 @@ DataFrame scaledData = scalerModel.transform(dataFrame);
 
 <div data-lang="python">
 {% highlight python %}
-from pyspark.mllib.util import MLUtils
 from pyspark.ml.feature import StandardScaler
 
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-dataFrame = sqlContext.createDataFrame(data)
+dataFrame = sqlContext.read.format("libsvm") \
+    .load("data/mllib/sample_libsvm_data.txt")
 scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                         withStd=True, withMean=False)
 
@@ -1424,10 +1408,9 @@ More details can be found in the API docs for
 [MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
 {% highlight scala %}
 import org.apache.spark.ml.feature.MinMaxScaler
-import org.apache.spark.mllib.util.MLUtils
 
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt")
 val scaler = new MinMaxScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures")
@@ -1448,13 +1431,10 @@ More details can be found in the API docs for
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.MinMaxScaler;
 import org.apache.spark.ml.feature.MinMaxScalerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;
 
-JavaRDD<LabeledPoint> data =
-  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 MinMaxScaler scaler = new MinMaxScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures");
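
The same one-liner backs every feature-transformer example in this file. A short spark-shell sketch (again assuming predefined sc and sqlContext and the Spark 1.5/1.6-era APIs) showing the loaded DataFrame flowing into one of the transformers above:

import org.apache.spark.ml.feature.Normalizer

val dataFrame = sqlContext.read.format("libsvm")
  .load("data/mllib/sample_libsvm_data.txt")

// Normalizer is a plain Transformer (no fit step): it maps the "features"
// column to L^1-normalized copies in a new output column.
val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normFeatures")
  .setP(1.0)
normalizer.transform(dataFrame).select("features", "normFeatures").show(3)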
