
Commit 2ec848a

Merge branch 'master' of git://git.apache.org/spark into metrics-structure-improvement
2 parents 3ea7896 + 0a7091e

File tree: 33 files changed, +1187 −89 lines


README.md

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ can be run using:
     ./dev/run-tests

 Please see the guidance on how to
-[run all automated tests](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-AutomatedTesting)
+[run all automated tests](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-AutomatedTesting).

 ## A Note About Hadoop Versions

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 1 addition & 1 deletion
@@ -776,7 +776,7 @@ private[spark] object PythonRDD extends Logging {
   }

   /**
-   * Convert and RDD of Java objects to and RDD of serialized Python objects, that is usable by
+   * Convert an RDD of Java objects to an RDD of serialized Python objects, that is usable by
    * PySpark.
    */
  def javaToPython(jRDD: JavaRDD[Any]): JavaRDD[Array[Byte]] = {

core/src/main/scala/org/apache/spark/network/ManagedBuffer.scala

Lines changed: 10 additions & 2 deletions
@@ -19,6 +19,7 @@ package org.apache.spark.network

 import java.io.{FileInputStream, RandomAccessFile, File, InputStream}
 import java.nio.ByteBuffer
+import java.nio.channels.FileChannel
 import java.nio.channels.FileChannel.MapMode

 import com.google.common.io.ByteStreams
@@ -66,8 +67,15 @@ final class FileSegmentManagedBuffer(val file: File, val offset: Long, val lengt
   override def size: Long = length

   override def nioByteBuffer(): ByteBuffer = {
-    val channel = new RandomAccessFile(file, "r").getChannel
-    channel.map(MapMode.READ_ONLY, offset, length)
+    var channel: FileChannel = null
+    try {
+      channel = new RandomAccessFile(file, "r").getChannel
+      channel.map(MapMode.READ_ONLY, offset, length)
+    } finally {
+      if (channel != null) {
+        channel.close()
+      }
+    }
   }

   override def inputStream(): InputStream = {
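The fix above ensures the memory-mapped read always releases its FileChannel, even when map() throws. A minimal standalone sketch of the same close-in-finally pattern, outside Spark and with hypothetical file/offset/length values:

import java.io.{File, RandomAccessFile}
import java.nio.ByteBuffer
import java.nio.channels.FileChannel
import java.nio.channels.FileChannel.MapMode

object MappedSegmentSketch {
  // Map [offset, offset + length) of a file read-only. Per the JDK docs the
  // mapping remains valid after the channel is closed, so closing it in
  // finally is safe and avoids leaking a file handle on failure.
  def mapSegment(file: File, offset: Long, length: Long): ByteBuffer = {
    var channel: FileChannel = null
    try {
      channel = new RandomAccessFile(file, "r").getChannel
      channel.map(MapMode.READ_ONLY, offset, length)
    } finally {
      if (channel != null) {
        channel.close()
      }
    }
  }

  def main(args: Array[String]): Unit = {
    // Hypothetical example: map the first 16 bytes of a local file.
    val buf = mapSegment(new File("/tmp/example.bin"), 0L, 16L)
    println(s"Mapped ${buf.remaining()} bytes")
  }
}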

core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable.HashSet
 import scala.collection.mutable.Queue

-import org.apache.spark.{TaskContext, Logging, SparkException}
+import org.apache.spark.{TaskContext, Logging}
 import org.apache.spark.network.{ManagedBuffer, BlockFetchingListener, BlockTransferService}
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.util.Utils

core/src/test/scala/org/apache/spark/ui/UISuite.scala

Lines changed: 5 additions & 9 deletions
@@ -23,7 +23,6 @@ import javax.servlet.http.HttpServletRequest
 import scala.io.Source
 import scala.util.{Failure, Success, Try}

-import org.eclipse.jetty.server.Server
 import org.eclipse.jetty.servlet.ServletContextHandler
 import org.scalatest.FunSuite
 import org.scalatest.concurrent.Eventually._
@@ -108,14 +107,8 @@ class UISuite extends FunSuite {
   }

   test("jetty selects different port under contention") {
-    val startPort = 4040
-    val server = new Server(startPort)
-
-    Try { server.start() } match {
-      case Success(s) =>
-      case Failure(e) =>
-      // Either case server port is busy hence setup for test complete
-    }
+    val server = new ServerSocket(0)
+    val startPort = server.getLocalPort
     val serverInfo1 = JettyUtils.startJettyServer(
       "0.0.0.0", startPort, Seq[ServletContextHandler](), new SparkConf)
     val serverInfo2 = JettyUtils.startJettyServer(
@@ -126,6 +119,9 @@ class UISuite extends FunSuite {
     assert(boundPort1 != startPort)
     assert(boundPort2 != startPort)
     assert(boundPort1 != boundPort2)
+    serverInfo1.server.stop()
+    serverInfo2.server.stop()
+    server.close()
   }

   test("jetty binds to port 0 correctly") {

docs/_layouts/global.html

Lines changed: 2 additions & 1 deletion
@@ -111,6 +111,7 @@
                             <li class="divider"></li>
                             <li><a href="building-spark.html">Building Spark</a></li>
                             <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
+                            <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects">Supplemental Projects</a></li>
                         </ul>
                     </li>
                 </ul>
@@ -151,7 +152,7 @@ <h1 class="title">{{ page.title }}</h1>
             MathJax.Hub.Config({
                 tex2jax: {
                     inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
-                    displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], 
+                    displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
                     processEscapes: true,
                     skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
                 }

docs/ec2-scripts.md

Lines changed: 1 addition & 1 deletion
@@ -156,6 +156,6 @@ If you have a patch or suggestion for one of these limitations, feel free to

 # Accessing Data in S3

-Spark's file interface allows it to process data in Amazon S3 using the same URI formats that are supported for Hadoop. You can specify a path in S3 as input through a URI of the form `s3n://<bucket>/path`. You will also need to set your Amazon security credentials, either by setting the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` before your program or through `SparkContext.hadoopConfiguration`. Full instructions on S3 access using the Hadoop input libraries can be found on the [Hadoop S3 page](http://wiki.apache.org/hadoop/AmazonS3).
+Spark's file interface allows it to process data in Amazon S3 using the same URI formats that are supported for Hadoop. You can specify a path in S3 as input through a URI of the form `s3n://<bucket>/path`. To provide AWS credentials for S3 access, launch the Spark cluster with the option `--copy-aws-credentials`. Full instructions on S3 access using the Hadoop input libraries can be found on the [Hadoop S3 page](http://wiki.apache.org/hadoop/AmazonS3).

 In addition to using a single input file, you can also use a directory of files as input by simply giving the path to the directory.
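For context, reading an S3 path from a Spark application looks like the sketch below. It assumes the cluster already has working AWS credentials (for example, a cluster launched with --copy-aws-credentials as described above); the bucket and key names are placeholders:

import org.apache.spark.{SparkConf, SparkContext}

object S3ReadSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("S3ReadSketch"))
    // Placeholder bucket/key; any s3n:// URI reachable with the cluster's
    // credentials works the same way.
    val lines = sc.textFile("s3n://my-bucket/logs/part-00000")
    println(s"Line count: ${lines.count()}")
    sc.stop()
  }
}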

docs/index.md

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ options for deployment:
 * [OpenStack Swift](storage-openstack-swift.html)
 * [Building Spark](building-spark.html): build Spark using the Maven system
 * [Contributing to Spark](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark)
+* [Supplemental Projects](https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects): related third party Spark projects

 **External Resources:**

docs/sql-programming-guide.md

Lines changed: 40 additions & 18 deletions
@@ -128,7 +128,7 @@ feature parity with a HiveContext.

 </div>

-The specific variant of SQL that is used to parse queries can also be selected using the 
+The specific variant of SQL that is used to parse queries can also be selected using the
 `spark.sql.dialect` option. This parameter can be changed using either the `setConf` method on
 a SQLContext or by using a `SET key=value` command in SQL. For a SQLContext, the only dialect
 available is "sql" which uses a simple SQL parser provided by Spark SQL. In a HiveContext, the
@@ -139,7 +139,7 @@ default is "hiveql", though "sql" is also available. Since the HiveQL parser is

 Spark SQL supports operating on a variety of data sources through the `SchemaRDD` interface.
 A SchemaRDD can be operated on as normal RDDs and can also be registered as a temporary table.
-Registering a SchemaRDD as a table allows you to run SQL queries over its data. This section 
+Registering a SchemaRDD as a table allows you to run SQL queries over its data. This section
 describes the various methods for loading data into a SchemaRDD.

 ## RDDs
@@ -152,7 +152,7 @@ while writing your Spark application.
 The second method for creating SchemaRDDs is through a programmatic interface that allows you to
 construct a schema and then apply it to an existing RDD. While this method is more verbose, it allows
 you to construct SchemaRDDs when the columns and their types are not known until runtime.
- 
+
 ### Inferring the Schema Using Reflection
 <div class="codetabs">

@@ -193,7 +193,7 @@ teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
 <div data-lang="java" markdown="1">

 Spark SQL supports automatically converting an RDD of [JavaBeans](http://stackoverflow.com/questions/3295496/what-is-a-javabean-exactly)
-into a Schema RDD. The BeanInfo, obtained using reflection, defines the schema of the table. 
+into a Schema RDD. The BeanInfo, obtained using reflection, defines the schema of the table.
 Currently, Spark SQL does not support JavaBeans that contain
 nested or contain complex types such as Lists or Arrays. You can create a JavaBean by creating a
 class that implements Serializable and has getters and setters for all of its fields.
@@ -480,7 +480,7 @@ for name in names.collect():

 [Parquet](http://parquet.io) is a columnar format that is supported by many other data processing systems.
 Spark SQL provides support for both reading and writing Parquet files that automatically preserves the schema
-of the original data. 
+of the original data.

 ### Loading Data Programmatically

@@ -562,7 +562,7 @@ for teenName in teenNames.collect():

 </div>

-</div> 
+</div>

 ### Configuration

@@ -808,7 +808,7 @@ memory usage and GC pressure. You can call `uncacheTable("tableName")` to remove
 Note that if you call `cache` rather than `cacheTable`, tables will _not_ be cached using
 the in-memory columnar format, and therefore `cacheTable` is strongly recommended for this use case.

-Configuration of in-memory caching can be done using the `setConf` method on SQLContext or by running 
+Configuration of in-memory caching can be done using the `setConf` method on SQLContext or by running
 `SET key=value` commands using SQL.

 <table class="table">
@@ -881,10 +881,32 @@ To start the JDBC server, run the following in the Spark directory:

     ./sbin/start-thriftserver.sh

-The default port the server listens on is 10000. To listen on customized host and port, please set
-the `HIVE_SERVER2_THRIFT_PORT` and `HIVE_SERVER2_THRIFT_BIND_HOST` environment variables. You may
-run `./sbin/start-thriftserver.sh --help` for a complete list of all available options. Now you can
-use beeline to test the Thrift JDBC server:
+This script accepts all `bin/spark-submit` command line options, plus a `--hiveconf` option to
+specify Hive properties. You may run `./sbin/start-thriftserver.sh --help` for a complete list of
+all available options. By default, the server listens on localhost:10000. You may override this
+bahaviour via either environment variables, i.e.:
+
+{% highlight bash %}
+export HIVE_SERVER2_THRIFT_PORT=<listening-port>
+export HIVE_SERVER2_THRIFT_BIND_HOST=<listening-host>
+./sbin/start-thriftserver.sh \
+  --master <master-uri> \
+  ...
+```
+{% endhighlight %}
+
+or system properties:
+
+{% highlight bash %}
+./sbin/start-thriftserver.sh \
+  --hiveconf hive.server2.thrift.port=<listening-port> \
+  --hiveconf hive.server2.thrift.bind.host=<listening-host> \
+  --master <master-uri>
+  ...
+```
+{% endhighlight %}
+
+Now you can use beeline to test the Thrift JDBC server:

     ./bin/beeline

@@ -930,7 +952,7 @@ SQL deprecates this property in favor of `spark.sql.shuffle.partitions`, whose d
 is 200. Users may customize this property via `SET`:

     SET spark.sql.shuffle.partitions=10;
-    SELECT page, count(*) c 
+    SELECT page, count(*) c
     FROM logs_last_month_cached
     GROUP BY page ORDER BY c DESC LIMIT 10;

@@ -1139,7 +1161,7 @@ evaluated by the SQL execution engine. A full list of the functions supported c
 <div data-lang="scala" markdown="1">

 All data types of Spark SQL are located in the package `org.apache.spark.sql`.
-You can access them by doing 
+You can access them by doing
 {% highlight scala %}
 import org.apache.spark.sql._
 {% endhighlight %}
@@ -1245,7 +1267,7 @@ import org.apache.spark.sql._
 <tr>
   <td> <b>StructType</b> </td>
   <td> org.apache.spark.sql.Row </td>
-  <td> 
+  <td>
   StructType(<i>fields</i>)<br />
   <b>Note:</b> <i>fields</i> is a Seq of StructFields. Also, two fields with the same
   name are not allowed.
@@ -1267,7 +1289,7 @@ import org.apache.spark.sql._

 All data types of Spark SQL are located in the package of
 `org.apache.spark.sql.api.java`. To access or create a data type,
-please use factory methods provided in 
+please use factory methods provided in
 `org.apache.spark.sql.api.java.DataType`.

 <table class="table">
@@ -1373,7 +1395,7 @@ please use factory methods provided in
 <tr>
   <td> <b>StructType</b> </td>
   <td> org.apache.spark.sql.api.java </td>
-  <td> 
+  <td>
   DataType.createStructType(<i>fields</i>)<br />
   <b>Note:</b> <i>fields</i> is a List or an array of StructFields.
   Also, two fields with the same name are not allowed.
@@ -1394,7 +1416,7 @@ please use factory methods provided in
 <div data-lang="python" markdown="1">

 All data types of Spark SQL are located in the package of `pyspark.sql`.
-You can access them by doing 
+You can access them by doing
 {% highlight python %}
 from pyspark.sql import *
 {% endhighlight %}
@@ -1518,7 +1540,7 @@ from pyspark.sql import *
 <tr>
   <td> <b>StructType</b> </td>
   <td> list or tuple </td>
-  <td> 
+  <td>
   StructType(<i>fields</i>)<br />
   <b>Note:</b> <i>fields</i> is a Seq of StructFields. Also, two fields with the same
   name are not allowed.
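Several of the passages touched above (dialect selection, in-memory caching, shuffle partitions) configure Spark SQL either through setConf on a SQLContext or through a SQL SET statement. A minimal sketch of both routes against a Spark 1.x-era SQLContext, reusing the spark.sql.shuffle.partitions property from the guide:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SqlConfSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SqlConfSketch"))
    val sqlContext = new SQLContext(sc)

    // Route 1: programmatic configuration on the SQLContext.
    sqlContext.setConf("spark.sql.shuffle.partitions", "10")

    // Route 2: the same property set through a SQL statement.
    sqlContext.sql("SET spark.sql.shuffle.partitions=10")

    sc.stop()
  }
}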

ec2/deploy.generic/root/spark-ec2/ec2-variables.sh

Lines changed: 2 additions & 0 deletions
@@ -30,3 +30,5 @@ export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}"
 export SWAP_MB="{{swap}}"
 export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}"
 export SPARK_MASTER_OPTS="{{spark_master_opts}}"
+export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}"
+export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}"
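These templated exports make the launching user's AWS keys visible on the cluster nodes as environment variables. One way an application could forward them to Hadoop's s3n filesystem settings is sketched below; the property names are the standard Hadoop s3n keys, and whether this step is needed at all depends on how the cluster propagates credentials, so treat it as an illustrative assumption rather than required setup:

import org.apache.spark.{SparkConf, SparkContext}

object ForwardAwsCredentialsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ForwardAwsCredentials"))

    // Copy the env vars exported in ec2-variables.sh into the Hadoop
    // configuration used for s3n:// paths, if they are present.
    sys.env.get("AWS_ACCESS_KEY_ID").foreach { id =>
      sc.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", id)
    }
    sys.env.get("AWS_SECRET_ACCESS_KEY").foreach { secret =>
      sc.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", secret)
    }

    sc.stop()
  }
}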

0 commit comments
