
Commit 25510f5

Author: bomeng
Merge remote-tracking branch 'upstream/master' into SPARK-16004
2 parents: 0fe835d + 63470af

File tree

20 files changed, +461 -148 lines

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 1 addition & 1 deletion
@@ -391,7 +391,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli

     _conf.set("spark.executor.id", SparkContext.DRIVER_IDENTIFIER)

-    _jars = _conf.getOption("spark.jars").map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten
+    _jars = Utils.getUserJars(_conf)
     _files = _conf.getOption("spark.files").map(_.split(",")).map(_.filter(_.nonEmpty))
       .toSeq.flatten

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 25 additions & 0 deletions
@@ -2352,6 +2352,31 @@ private[spark] object Utils extends Logging {
     log.info(s"Started daemon with process name: ${Utils.getProcessName()}")
     SignalUtils.registerLogger(log)
   }
+
+  /**
+   * Unions two comma-separated lists of files and filters out empty strings.
+   */
+  def unionFileLists(leftList: Option[String], rightList: Option[String]): Set[String] = {
+    var allFiles = Set[String]()
+    leftList.foreach { value => allFiles ++= value.split(",") }
+    rightList.foreach { value => allFiles ++= value.split(",") }
+    allFiles.filter { _.nonEmpty }
+  }
+
+  /**
+   * In YARN mode this method returns a union of the jar files pointed to by "spark.jars" and
+   * the "spark.yarn.dist.jars" properties, while in other modes it returns only the jar files
+   * pointed to by the "spark.jars" property.
+   */
+  def getUserJars(conf: SparkConf): Seq[String] = {
+    val sparkJars = conf.getOption("spark.jars")
+    if (conf.get("spark.master") == "yarn") {
+      val yarnJars = conf.getOption("spark.yarn.dist.jars")
+      unionFileLists(sparkJars, yarnJars).toSeq
+    } else {
+      sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten
+    }
+  }
 }

 /**
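As a rough illustration of how the new helpers behave, here is a minimal sketch (not part of the commit). Note that Utils is private[spark], so this code would have to live under the org.apache.spark package, e.g. in a test; the package name, jar paths, and master values below are placeholders:

package org.apache.spark.demo

import org.apache.spark.SparkConf
import org.apache.spark.util.Utils

object GetUserJarsSketch {
  def main(args: Array[String]): Unit = {
    // YARN mode: "spark.jars" and "spark.yarn.dist.jars" are unioned, with
    // duplicates and empty entries dropped. unionFileLists builds a Set,
    // so the order of the result is unspecified.
    val yarnConf = new SparkConf(loadDefaults = false)
      .set("spark.master", "yarn")
      .set("spark.jars", "/tmp/a.jar,/tmp/b.jar")
      .set("spark.yarn.dist.jars", "/tmp/b.jar,/tmp/c.jar")
    println(Utils.getUserJars(yarnConf).sorted.mkString(","))
    // prints: /tmp/a.jar,/tmp/b.jar,/tmp/c.jar

    // Any other mode: only "spark.jars" is consulted; empty entries are
    // filtered out but the original order is kept.
    val localConf = new SparkConf(loadDefaults = false)
      .set("spark.master", "local[*]")
      .set("spark.jars", "/tmp/a.jar,,/tmp/b.jar")
    println(Utils.getUserJars(localConf).mkString(","))
    // prints: /tmp/a.jar,/tmp/b.jar
  }
}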

core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala

Lines changed: 12 additions & 0 deletions
@@ -570,6 +570,18 @@ class SparkSubmitSuite
       appArgs.executorMemory should be ("2.3g")
     }
   }
+
+  test("comma separated list of files are unioned correctly") {
+    val left = Option("/tmp/a.jar,/tmp/b.jar")
+    val right = Option("/tmp/c.jar,/tmp/a.jar")
+    val emptyString = Option("")
+    Utils.unionFileLists(left, right) should be (Set("/tmp/a.jar", "/tmp/b.jar", "/tmp/c.jar"))
+    Utils.unionFileLists(emptyString, emptyString) should be (Set.empty)
+    Utils.unionFileLists(Option("/tmp/a.jar"), emptyString) should be (Set("/tmp/a.jar"))
+    Utils.unionFileLists(emptyString, Option("/tmp/a.jar")) should be (Set("/tmp/a.jar"))
+    Utils.unionFileLists(None, Option("/tmp/a.jar")) should be (Set("/tmp/a.jar"))
+    Utils.unionFileLists(Option("/tmp/a.jar"), None) should be (Set("/tmp/a.jar"))
+  }
   // scalastyle:on println

   // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.

data/mllib/sample_isotonic_regression_data.txt

Lines changed: 0 additions & 100 deletions
This file was deleted.
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+0.24579296 1:0.01
+0.28505864 1:0.02
+0.31208567 1:0.03
+0.35900051 1:0.04
+0.35747068 1:0.05
+0.16675166 1:0.06
+0.17491076 1:0.07
+0.04181540 1:0.08
+0.04793473 1:0.09
+0.03926568 1:0.10
+0.12952575 1:0.11
+0.00000000 1:0.12
+0.01376849 1:0.13
+0.13105558 1:0.14
+0.08873024 1:0.15
+0.12595614 1:0.16
+0.15247323 1:0.17
+0.25956145 1:0.18
+0.20040796 1:0.19
+0.19581846 1:0.20
+0.15757267 1:0.21
+0.13717491 1:0.22
+0.19020908 1:0.23
+0.19581846 1:0.24
+0.20091790 1:0.25
+0.16879143 1:0.26
+0.18510964 1:0.27
+0.20040796 1:0.28
+0.29576747 1:0.29
+0.43396226 1:0.30
+0.53391127 1:0.31
+0.52116267 1:0.32
+0.48546660 1:0.33
+0.49209587 1:0.34
+0.54156043 1:0.35
+0.59765426 1:0.36
+0.56144824 1:0.37
+0.58592555 1:0.38
+0.52983172 1:0.39
+0.50178480 1:0.40
+0.52626211 1:0.41
+0.58286588 1:0.42
+0.64660887 1:0.43
+0.68077511 1:0.44
+0.74298827 1:0.45
+0.64864865 1:0.46
+0.67261601 1:0.47
+0.65782764 1:0.48
+0.69811321 1:0.49
+0.63029067 1:0.50
+0.61601224 1:0.51
+0.63233044 1:0.52
+0.65323814 1:0.53
+0.65323814 1:0.54
+0.67363590 1:0.55
+0.67006629 1:0.56
+0.51555329 1:0.57
+0.50892402 1:0.58
+0.33299337 1:0.59
+0.36206017 1:0.60
+0.43090260 1:0.61
+0.45996940 1:0.62
+0.56348802 1:0.63
+0.54920959 1:0.64
+0.48393677 1:0.65
+0.48495665 1:0.66
+0.46965834 1:0.67
+0.45181030 1:0.68
+0.45843957 1:0.69
+0.47118817 1:0.70
+0.51555329 1:0.71
+0.58031617 1:0.72
+0.55481897 1:0.73
+0.56297807 1:0.74
+0.56603774 1:0.75
+0.57929628 1:0.76
+0.64762876 1:0.77
+0.66241713 1:0.78
+0.69301377 1:0.79
+0.65119837 1:0.80
+0.68332483 1:0.81
+0.66598674 1:0.82
+0.73890872 1:0.83
+0.73992861 1:0.84
+0.84242733 1:0.85
+0.91330954 1:0.86
+0.88016318 1:0.87
+0.90719021 1:0.88
+0.93115757 1:0.89
+0.93115757 1:0.90
+0.91942886 1:0.91
+0.92911780 1:0.92
+0.95665477 1:0.93
+0.95002550 1:0.94
+0.96940337 1:0.95
+1.00000000 1:0.96
+0.89801122 1:0.97
+0.90311066 1:0.98
+0.90362060 1:0.99
+0.83477817 1:1.0

docs/ml-classification-regression.md

Lines changed: 70 additions & 0 deletions
@@ -691,6 +691,76 @@ The implementation matches the result from R's survival function
 </div>


+## Isotonic regression
+[Isotonic regression](http://en.wikipedia.org/wiki/Isotonic_regression)
+belongs to the family of regression algorithms. Formally, isotonic regression is a problem where,
+given a finite set of real numbers `$Y = {y_1, y_2, ..., y_n}$` representing observed responses
+and `$X = {x_1, x_2, ..., x_n}$` the unknown response values to be fitted,
+we find a function that minimises
+
+`\begin{equation}
+  f(x) = \sum_{i=1}^n w_i (y_i - x_i)^2
+\end{equation}`
+
+with respect to the complete order, subject to
+`$x_1\le x_2\le ...\le x_n$`, where the `$w_i$` are positive weights.
+The resulting function is called isotonic regression and it is unique.
+It can be viewed as a least squares problem under an order restriction.
+Essentially isotonic regression is a
+[monotonic function](http://en.wikipedia.org/wiki/Monotonic_function)
+best fitting the original data points.
+
+We implement a
+[pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111)
+which uses an approach to
+[parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10).
+The training input is a DataFrame which contains three columns:
+label, features and weight. Additionally, the IsotonicRegression algorithm has one
+optional parameter called $isotonic$, which defaults to true.
+This argument specifies whether the isotonic regression is
+isotonic (monotonically increasing) or antitonic (monotonically decreasing).
+
+Training returns an IsotonicRegressionModel that can be used to predict
+labels for both known and unknown features. The result of isotonic regression
+is treated as a piecewise linear function. The rules for prediction therefore are:
+
+* If the prediction input exactly matches a training feature,
+then the associated prediction is returned. If there are multiple predictions with the same
+feature, then one of them is returned; which one is undefined
+(same as java.util.Arrays.binarySearch).
+* If the prediction input is lower or higher than all training features,
+then the prediction with the lowest or highest feature is returned respectively.
+If there are multiple predictions with the same feature,
+then the lowest or highest one is returned respectively.
+* If the prediction input falls between two training features, then the prediction is treated
+as a piecewise linear function and an interpolated value is calculated from the
+predictions of the two closest features. If there are multiple values
+with the same feature, then the same rules as in the previous point are used.
+
+### Examples
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+Refer to the [`IsotonicRegression` Scala docs](api/scala/index.html#org.apache.spark.ml.regression.IsotonicRegression) for details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala %}
+</div>
+<div data-lang="java" markdown="1">
+
+Refer to the [`IsotonicRegression` Java docs](api/java/org/apache/spark/ml/regression/IsotonicRegression.html) for details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java %}
+</div>
+<div data-lang="python" markdown="1">
+
+Refer to the [`IsotonicRegression` Python docs](api/python/pyspark.ml.html#pyspark.ml.regression.IsotonicRegression) for more details on the API.
+
+{% include_example python/ml/isotonic_regression_example.py %}
+</div>
+</div>
+
+

 # Decision trees
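Since the {% include_example %} snippets in the diff above do not render in this view, here is a minimal standalone Scala sketch of the spark.ml API the new docs describe. Assumptions: a Spark 2.x build, and that the libsvm-format sample data added in this commit has been saved at the illustrative path used below:

import org.apache.spark.ml.regression.IsotonicRegression
import org.apache.spark.sql.SparkSession

object IsotonicRegressionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("IsotonicRegressionSketch")
      .getOrCreate()

    // The sample data is in libsvm format ("label 1:feature" per line);
    // this path is illustrative.
    val dataset = spark.read.format("libsvm")
      .load("data/mllib/sample_isotonic_regression_libsvm_data.txt")

    // Fit an isotonic (monotonically increasing) model;
    // setIsotonic(false) would fit an antitonic one instead.
    val model = new IsotonicRegression().setIsotonic(true).fit(dataset)

    // The fitted piecewise-linear function is stored as parallel vectors of
    // boundary features and their associated predictions.
    println(s"Boundaries in increasing order: ${model.boundaries}")
    println(s"Predictions associated with the boundaries: ${model.predictions}")

    // transform() applies the prediction rules listed in the docs: an exact
    // boundary match returns the stored prediction, an input between two
    // boundaries is linearly interpolated, and an input outside the range
    // gets the prediction of the lowest/highest boundary.
    model.transform(dataset).show(5)

    spark.stop()
  }
}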

docs/monitoring.md

Lines changed: 1 addition & 1 deletion
@@ -157,7 +157,7 @@ The history server can be configured as follows:
     If enabled, access control checks are made regardless of what the individual application had
     set for <code>spark.ui.acls.enable</code> when the application was run. The application owner
     will always have authorization to view their own application and any users specified via
-    <code>spark.ui.view.acls</code> and groups specified via <code>spark.ui.view.acls.groups<code>
+    <code>spark.ui.view.acls</code> and groups specified via <code>spark.ui.view.acls.groups</code>
     when the application was run will also have authorization to view that application.
     If disabled, no access control checks are made.
   </td>
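For example (user and group names are placeholders), a history server enabling the check with

    spark.history.ui.acls.enable  true

in its configuration means that an application which ran with

    spark.ui.view.acls         alice,bob
    spark.ui.view.acls.groups  dataeng

is viewable in the history server by its owner, by alice and bob, and by members of the dataeng group.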
