
Commit dd9ca7b

keypointt authored and srowen committed
[SPARK-13019][DOCS] fix for scala-2.10 build: Replace example code in mllib-statistics.md using include_example
## What changes were proposed in this pull request?

This PR for ticket SPARK-13019 is based on a previous PR (#11108). Since #11108 broke the scala-2.10 build, more work was needed to fix the build errors. What is new in this PR is the addition of keyword arguments for `fractions`:

`val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions)`

`val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions)`

I reopened the ticket on JIRA, but since I don't know how to reopen a GitHub pull request, I am submitting a new one.

## How was this patch tested?

Manual build testing on a local machine, building against scala-2.10.

Author: Xin Ren <[email protected]>

Closes #11901 from keypointt/SPARK-13019.
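For context, the calls quoted above come from the stratified-sampling example. Below is a minimal Scala sketch of that usage against the standard Spark RDD API; the object name and sample data are illustrative assumptions, not the commit's actual Scala example file, which is not rendered on this page.

import org.apache.spark.{SparkConf, SparkContext}

object StratifiedSamplingSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("StratifiedSamplingSketch"))

    // a key-value RDD whose keys identify the strata (data is illustrative)
    val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))

    // desired sampling fraction for each stratum, keyed by stratum
    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

    // the keyword-argument calls quoted in the commit message
    val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions)
    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions)

    println(s"approxSample size is ${approxSample.collect().length}")
    println(s"exactSample size is ${exactSample.collect().length}")

    sc.stop()
  }
}

Per the commit message, naming the `withReplacement` and `fractions` parameters is what allows this example to compile under scala-2.10.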
1 parent 048a759 commit dd9ca7b

19 files changed: +1076 -382 lines

docs/mllib-statistics.md

Lines changed: 56 additions & 382 deletions
Large diffs are not rendered by default.
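The markdown changes themselves are not rendered here, but the commit replaces the inline listings in mllib-statistics.md with Jekyll `include_example` tags that pull source from the example files added below, delimited by their `$example on$` / `$example off$` markers. A rough sketch of such a tag follows; the exact path used in the commit is an assumption derived from the Java package shown below.

{% include_example java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java %}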
JavaCorrelationsExample.java
Lines changed: 70 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.Statistics;
// $example off$

public class JavaCorrelationsExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
      )
    );

    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
  }
}
JavaHypothesisTestingExample.java
Lines changed: 84 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;
// $example off$

public class JavaHypothesisTestingExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // a vector composed of the frequencies of events
    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);

    // compute the goodness of fit. If a second vector to test against is not supplied
    // as a parameter, the test runs against a uniform distribution.
    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
    // summary of the test including the p-value, degrees of freedom, test statistic,
    // the method used, and the null hypothesis.
    System.out.println(goodnessOfFitTestResult + "\n");

    // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

    // conduct Pearson's independence test on the input contingency matrix
    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
    // summary of the test including the p-value, degrees of freedom...
    System.out.println(independenceTestResult + "\n");

    // an RDD of labeled points
    JavaRDD<LabeledPoint> obs = jsc.parallelize(
      Arrays.asList(
        new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
        new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
        new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
      )
    );

    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    // against the label.
    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
    int i = 1;
    for (ChiSqTestResult result : featureTestResults) {
      System.out.println("Column " + i + ":");
      System.out.println(result + "\n");  // summary of the test
      i++;
    }
    // $example off$

    jsc.stop();
  }
}
JavaHypothesisTestingKolmogorovSmirnovTestExample.java
Lines changed: 49 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
// $example off$

public class JavaHypothesisTestingKolmogorovSmirnovTestExample {
  public static void main(String[] args) {

    SparkConf conf =
      new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
    KolmogorovSmirnovTestResult testResult =
      Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
    // summary of the test including the p-value, test statistic, and null hypothesis
    // if our p-value indicates significance, we can reject the null hypothesis
    System.out.println(testResult);
    // $example off$

    jsc.stop();
  }
}
JavaKernelDensityEstimationExample.java
Lines changed: 53 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.stat.KernelDensity;
// $example off$

public class JavaKernelDensityEstimationExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // an RDD of sample data
    JavaRDD<Double> data = jsc.parallelize(
      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

    // Construct the density estimator with the sample data
    // and a standard deviation for the Gaussian kernels
    KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);

    // Find density estimates for the given values
    double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});

    System.out.println(Arrays.toString(densities));
    // $example off$

    jsc.stop();
  }
}
JavaStratifiedSamplingExample.java
Lines changed: 75 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import com.google.common.collect.ImmutableMap;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import java.util.*;

import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.VoidFunction;
// $example off$

public class JavaStratifiedSamplingExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    List<Tuple2<Integer, Character>> list = new ArrayList<Tuple2<Integer, Character>>(
      Arrays.<Tuple2<Integer, Character>>asList(
        new Tuple2(1, 'a'),
        new Tuple2(1, 'b'),
        new Tuple2(2, 'c'),
        new Tuple2(2, 'd'),
        new Tuple2(2, 'e'),
        new Tuple2(3, 'f')
      )
    );

    JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);

    // specify the exact fraction desired from each key Map<K, Object>
    ImmutableMap<Integer, Object> fractions =
      ImmutableMap.of(1, (Object) 0.1, 2, (Object) 0.6, 3, (Object) 0.3);

    // Get an approximate sample from each stratum
    JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
    // Get an exact sample from each stratum
    JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
    // $example off$

    System.out.println("approxSample size is " + approxSample.collect().size());
    for (Tuple2<Integer, Character> t : approxSample.collect()) {
      System.out.println(t._1() + " " + t._2());
    }

    System.out.println("exactSample size is " + exactSample.collect().size());
    for (Tuple2<Integer, Character> t : exactSample.collect()) {
      System.out.println(t._1() + " " + t._2());
    }

    jsc.stop();
  }
}
JavaSummaryStatisticsExample.java
Lines changed: 56 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
// $example off$

public class JavaSummaryStatisticsExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaRDD<Vector> mat = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    );  // an RDD of Vectors

    // Compute column summary statistics.
    MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
    System.out.println(summary.mean());  // a dense vector containing the mean value for each column
    System.out.println(summary.variance());  // column-wise variance
    System.out.println(summary.numNonzeros());  // number of nonzeros in each column
    // $example off$

    jsc.stop();
  }
}
