
Commit c6e339d

srowen authored and James Z.M. Gao committed
SPARK 1084.1 (resubmitted)
(Ported from https://github.com/apache/incubator-spark/pull/637 )

Author: Sean Owen <[email protected]>

Closes apache#31 from srowen/SPARK-1084.1 and squashes the following commits:

6c4a32c [Sean Owen] Suppress warnings about legitimate unchecked array creations, or change code to avoid it
f35b833 [Sean Owen] Fix two misc javadoc problems
254e8ef [Sean Owen] Fix one new style error introduced in scaladoc warning commit
5b2fce2 [Sean Owen] Fix scaladoc invocation warning, and enable javac warnings properly, with plugin config updates
007762b [Sean Owen] Remove dead scaladoc links
b8ff8cb [Sean Owen] Replace deprecated Ant <tasks> with <target>

Conflicts:
    bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala
    core/src/main/scala/org/apache/spark/util/StatCounter.scala
    streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
    streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
1 parent d8fc8e0 commit c6e339d
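Most of the Java-side churn in the diff below stems from javac's "unchecked generic array creation" warning, which fires whenever parameterized values are passed through a varargs method such as Arrays.asList. The following is a minimal, self-contained sketch of the warning and of the two remedies this commit applies (suppress where the call is known to be safe, or rewrite so no generic array is created); the class and method names here are illustrative only and do not appear in the diff:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Illustrative sketch only -- not code from this commit.
public class UncheckedVarargsExample {

  // Passing parameterized values through a varargs parameter (Arrays.asList here)
  // forces javac to materialize a List<Integer>[] array, which it reports as an
  // "unchecked generic array creation" warning at the call site.
  @SuppressWarnings("unchecked")  // remedy 1: suppress where the call is known to be safe
  static List<List<Integer>> suppressed() {
    return Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3, 4));
  }

  // Remedy 2: restructure so no array of a parameterized type is created at all.
  static List<List<Integer>> avoided() {
    List<List<Integer>> out = new ArrayList<List<Integer>>();
    out.add(Arrays.asList(1, 2));
    out.add(Arrays.asList(3, 4));
    return out;
  }

  public static void main(String[] args) {
    System.out.println(suppressed());
    System.out.println(avoided());
  }
}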

File tree

15 files changed  (+177, -103 lines)


bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala

Lines changed: 26 additions & 19 deletions
@@ -27,22 +27,24 @@ object Bagel extends Logging {
 
   /**
    * Runs a Bagel program.
-   * @param sc [[org.apache.spark.SparkContext]] to use for the program.
-   * @param vertices vertices of the graph represented as an RDD of (Key, Vertex) pairs. Often the Key will be
-   *                 the vertex id.
-   * @param messages initial set of messages represented as an RDD of (Key, Message) pairs. Often this will be an
-   *                 empty array, i.e. sc.parallelize(Array[K, Message]()).
-   * @param combiner [[org.apache.spark.bagel.Combiner]] combines multiple individual messages to a given vertex into one
-   *                 message before sending (which often involves network I/O).
-   * @param aggregator [[org.apache.spark.bagel.Aggregator]] performs a reduce across all vertices after each superstep,
-   *                   and provides the result to each vertex in the next superstep.
-   * @param partitioner [[org.apache.spark.Partitioner]] partitions values by key
+   * @param sc org.apache.spark.SparkContext to use for the program.
+   * @param vertices vertices of the graph represented as an RDD of (Key, Vertex) pairs. Often the
+   *                 Key will be the vertex id.
+   * @param messages initial set of messages represented as an RDD of (Key, Message) pairs. Often
+   *                 this will be an empty array, i.e. sc.parallelize(Array[K, Message]()).
+   * @param combiner [[org.apache.spark.bagel.Combiner]] combines multiple individual messages to a
+   *                 given vertex into one message before sending (which often involves network
+   *                 I/O).
+   * @param aggregator [[org.apache.spark.bagel.Aggregator]] performs a reduce across all vertices
+   *                   after each superstep and provides the result to each vertex in the next
+   *                   superstep.
+   * @param partitioner org.apache.spark.Partitioner partitions values by key
    * @param numPartitions number of partitions across which to split the graph.
    *                      Default is the default parallelism of the SparkContext
-   * @param storageLevel [[org.apache.spark.storage.StorageLevel]] to use for caching of intermediate RDDs in each superstep.
-   *                     Defaults to caching in memory.
-   * @param compute function that takes a Vertex, optional set of (possibly combined) messages to the Vertex,
-   *                optional Aggregator and the current superstep,
+   * @param storageLevel org.apache.spark.storage.StorageLevel to use for caching of
+   *                     intermediate RDDs in each superstep. Defaults to caching in memory.
+   * @param compute function that takes a Vertex, optional set of (possibly combined) messages to
+   *                the Vertex, optional Aggregator and the current superstep,
    *                and returns a set of (Vertex, outgoing Messages) pairs
    * @tparam K key
    * @tparam V vertex type
@@ -127,8 +129,8 @@ object Bagel extends Logging {
   }
 
   /**
-   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default [[org.apache.spark.HashPartitioner]]
-   * and default storage level
+   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default
+   * org.apache.spark.HashPartitioner and default storage level
    */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest](
       sc: SparkContext,
@@ -140,7 +142,10 @@ object Bagel extends Logging {
       compute: (V, Option[C], Int) => (V, Array[M])
     ): RDD[(K, V)] = run(sc, vertices, messages, combiner, numPartitions, DEFAULT_STORAGE_LEVEL)(compute)
 
-  /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the default [[org.apache.spark.HashPartitioner]]*/
+  /**
+   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the
+   * default org.apache.spark.HashPartitioner
+   */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest](
       sc: SparkContext,
       vertices: RDD[(K, V)],
@@ -158,7 +163,8 @@ object Bagel extends Logging {
   }
 
   /**
-   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default [[org.apache.spark.HashPartitioner]],
+   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]],
+   * default org.apache.spark.HashPartitioner,
    * [[org.apache.spark.bagel.DefaultCombiner]] and the default storage level
    */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest](
@@ -171,7 +177,8 @@ object Bagel extends Logging {
     ): RDD[(K, V)] = run(sc, vertices, messages, numPartitions, DEFAULT_STORAGE_LEVEL)(compute)
 
   /**
-   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], the default [[org.apache.spark.HashPartitioner]]
+   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]],
+   * the default org.apache.spark.HashPartitioner
    * and [[org.apache.spark.bagel.DefaultCombiner]]
    */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest](

core/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -231,7 +231,7 @@
             </goals>
             <configuration>
               <exportAntProperties>true</exportAntProperties>
-              <tasks>
+              <target>
                 <property name="spark.classpath" refid="maven.test.classpath" />
                 <property environment="env" />
                 <fail message="Please set the SCALA_HOME (or SCALA_LIBRARY_PATH if scala is on the path) environment variables and retry.">
@@ -244,7 +244,7 @@
                   </not>
                 </condition>
               </fail>
-              </tasks>
+              </target>
             </configuration>
           </execution>
         </executions>

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 1 addition & 1 deletion
@@ -350,7 +350,7 @@ class SparkContext(
    * using the older MapReduce API (`org.apache.hadoop.mapred`).
    *
    * @param conf JobConf for setting up the dataset
-   * @param inputFormatClass Class of the [[InputFormat]]
+   * @param inputFormatClass Class of the InputFormat
    * @param keyClass Class of the keys
    * @param valueClass Class of the values
    * @param minSplits Minimum number of Hadoop Splits to generate.

core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ class JobLogger(val user: String, val logDirName: String)
   /**
    * Create a log file for one job
    * @param jobID ID of the job
-   * @exception FileNotFoundException Fail to create log file
+   * @throws FileNotFoundException Fail to create log file
    */
   protected def createLogWriter(jobID: Int) {
     try {

core/src/main/scala/org/apache/spark/util/IndestructibleActorSystem.scala

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ import scala.util.control.{ControlThrowable, NonFatal}
 import com.typesafe.config.Config
 
 /**
- * An [[akka.actor.ActorSystem]] which refuses to shut down in the event of a fatal exception.
+ * An akka.actor.ActorSystem which refuses to shut down in the event of a fatal exception
  * This is necessary as Spark Executors are allowed to recover from fatal exceptions
- * (see [[org.apache.spark.executor.Executor]]).
+ * (see org.apache.spark.executor.Executor)
  */
 object IndestructibleActorSystem {
   def apply(name: String, config: Config): ActorSystem =

core/src/main/scala/org/apache/spark/util/StatCounter.scala

Lines changed: 3 additions & 2 deletions
@@ -19,8 +19,9 @@ package org.apache.spark.util
 
 /**
  * A class for tracking the statistics of a set of numbers (count, mean and variance) in a
- * numerically robust way. Includes support for merging two StatCounters. Based on
- * [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance Welford and Chan's algorithms for running variance]].
+ * numerically robust way. Includes support for merging two StatCounters. Based on Welford
+ * and Chan's [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance algorithms]]
+ * for running variance.
  *
  * @constructor Initialize the StatCounter with the given values.
  */

core/src/main/scala/org/apache/spark/util/Vector.scala

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ object Vector {
 
   /**
    * Creates this [[org.apache.spark.util.Vector]] of given length containing random numbers
-   * between 0.0 and 1.0. Optional [[scala.util.Random]] number generator can be provided.
+   * between 0.0 and 1.0. Optional scala.util.Random number generator can be provided.
    */
   def random(length: Int, random: Random = new XORShiftRandom()) = Vector(length, _ => random.nextDouble())
 
core/src/test/scala/org/apache/spark/JavaAPISuite.java

Lines changed: 25 additions & 10 deletions
@@ -76,8 +76,9 @@ public int compare(Integer a, Integer b) {
       else if (a < b) return 1;
       else return 0;
     }
-  };
+  }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void sparkContextUnion() {
     // Union of non-specialized JavaRDDs
@@ -149,6 +150,7 @@ public void call(String s) {
     Assert.assertEquals(2, foreachCalls);
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void lookup() {
     JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList(
@@ -180,6 +182,7 @@ public Boolean call(Integer x) {
     Assert.assertEquals(5, oddsAndEvens.lookup(false).get(0).size()); // Odds
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void cogroup() {
     JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList(
@@ -198,6 +201,7 @@ public void cogroup() {
     cogrouped.collect();
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void leftOuterJoin() {
     JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
@@ -244,6 +248,7 @@ public Integer call(Integer a, Integer b) {
     Assert.assertEquals(33, sum);
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void foldByKey() {
     List<Tuple2<Integer, Integer>> pairs = Arrays.asList(
@@ -266,6 +271,7 @@ public Integer call(Integer a, Integer b) {
     Assert.assertEquals(3, sums.lookup(3).get(0).intValue());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void reduceByKey() {
     List<Tuple2<Integer, Integer>> pairs = Arrays.asList(
@@ -321,8 +327,8 @@ public void approximateResults() {
   public void take() {
     JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));
     Assert.assertEquals(1, rdd.first().intValue());
-    List<Integer> firstTwo = rdd.take(2);
-    List<Integer> sample = rdd.takeSample(false, 2, 42);
+    rdd.take(2);
+    rdd.takeSample(false, 2, 42);
   }
 
   @Test
@@ -360,8 +366,8 @@ public Boolean call(Double x) {
     Assert.assertEquals(2.49444, rdd.stdev(), 0.01);
     Assert.assertEquals(2.73252, rdd.sampleStdev(), 0.01);
 
-    Double first = rdd.first();
-    List<Double> take = rdd.take(5);
+    rdd.first();
+    rdd.take(5);
   }
 
   @Test
@@ -439,11 +445,11 @@ public Iterable<Double> call(String s) {
         return lengths;
       }
     });
-    Double x = doubles.first();
-    Assert.assertEquals(5.0, doubles.first().doubleValue(), 0.01);
+    Assert.assertEquals(5.0, doubles.first(), 0.01);
     Assert.assertEquals(11, pairs.count());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void mapsFromPairsToPairs() {
     List<Tuple2<Integer, String>> pairs = Arrays.asList(
@@ -510,6 +516,7 @@ public void repartition() {
     }
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void persist() {
     JavaDoubleRDD doubleRDD = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));
@@ -574,6 +581,7 @@ public void textFilesCompressed() throws IOException {
     Assert.assertEquals(expected, readRDD.collect());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void sequenceFile() {
     File tempDir = Files.createTempDir();
@@ -603,6 +611,7 @@ public Tuple2<Integer, String> call(Tuple2<IntWritable, Text> pair) {
     Assert.assertEquals(pairs, readRDD.collect());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void writeWithNewAPIHadoopFile() {
     File tempDir = Files.createTempDir();
@@ -633,6 +642,7 @@ public String call(Tuple2<IntWritable, Text> x) {
     }).collect().toString());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void readWithNewAPIHadoopFile() throws IOException {
     File tempDir = Files.createTempDir();
@@ -675,6 +685,7 @@ public void objectFilesOfInts() {
     Assert.assertEquals(expected, readRDD.collect());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void objectFilesOfComplexTypes() {
     File tempDir = Files.createTempDir();
@@ -691,6 +702,7 @@ public void objectFilesOfComplexTypes() {
     Assert.assertEquals(pairs, readRDD.collect());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void hadoopFile() {
     File tempDir = Files.createTempDir();
@@ -720,6 +732,7 @@ public String call(Tuple2<IntWritable, Text> x) {
     }).collect().toString());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void hadoopFileCompressed() {
     File tempDir = Files.createTempDir();
@@ -825,7 +838,7 @@ public Float zero(Float initialValue) {
       }
     };
 
-    final Accumulator<Float> floatAccum = sc.accumulator((Float) 10.0f, floatAccumulatorParam);
+    final Accumulator<Float> floatAccum = sc.accumulator(10.0f, floatAccumulatorParam);
     rdd.foreach(new VoidFunction<Integer>() {
       public void call(Integer x) {
         floatAccum.add((float) x);
@@ -877,6 +890,7 @@ public void checkpointAndRestore() {
     Assert.assertEquals(Arrays.asList(1, 2, 3, 4, 5), recovered.collect());
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void mapOnPairRDD() {
     JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1,2,3,4));
@@ -901,6 +915,7 @@ public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> in) throws Excepti
 
   }
 
+  @SuppressWarnings("unchecked")
   @Test
   public void collectPartitions() {
     JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7), 3);
@@ -969,14 +984,14 @@ public void countApproxDistinctByKey() {
   @Test
   public void collectAsMapWithIntArrayValues() {
     // Regression test for SPARK-1040
-    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(new Integer[] { 1 }));
+    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1));
     JavaPairRDD<Integer, int[]> pairRDD = rdd.map(new PairFunction<Integer, Integer, int[]>() {
       @Override
       public Tuple2<Integer, int[]> call(Integer x) throws Exception {
         return new Tuple2<Integer, int[]>(x, new int[] { x });
       }
     });
     pairRDD.collect(); // Works fine
-    Map<Integer, int[]> map = pairRDD.collectAsMap(); // Used to crash with ClassCastException
+    pairRDD.collectAsMap(); // Used to crash with ClassCastException
   }
 }
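For context on why nearly every suppression above sits on a test that builds pairs with Arrays.asList: passing parameterized Tuple2 values through varargs creates an array of a parameterized type, which javac flags at every such call site. A hedged sketch of that pattern follows (it assumes scala.Tuple2 from scala-library on the classpath; the class and method names are invented for illustration and are not from this file):

import java.util.Arrays;
import java.util.List;

import scala.Tuple2;

class PairFixtures {
  // The varargs call materializes a Tuple2<String, String>[], triggering an
  // unchecked generic array creation warning; annotating the enclosing method
  // keeps the fixture readable without changing its behavior.
  @SuppressWarnings("unchecked")
  static List<Tuple2<String, String>> categories() {
    return Arrays.asList(
      new Tuple2<String, String>("Apples", "Fruit"),
      new Tuple2<String, String>("Oranges", "Fruit"),
      new Tuple2<String, String>("Oranges", "Citrus"));
  }
}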

pom.xml

Lines changed: 3 additions & 2 deletions
@@ -607,12 +607,13 @@
         <plugin>
           <groupId>org.apache.maven.plugins</groupId>
           <artifactId>maven-compiler-plugin</artifactId>
-          <version>2.5.1</version>
+          <version>3.1</version>
           <configuration>
             <source>${java.version}</source>
             <target>${java.version}</target>
             <encoding>UTF-8</encoding>
             <maxmem>1024m</maxmem>
+            <fork>true</fork>
           </configuration>
         </plugin>
         <plugin>
@@ -627,7 +628,7 @@
         <plugin>
           <groupId>org.scalatest</groupId>
           <artifactId>scalatest-maven-plugin</artifactId>
-          <version>1.0-M2</version>
+          <version>1.0-RC2</version>
           <configuration>
             <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
             <junitxml>.</junitxml>

repl/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -112,7 +112,7 @@
             </goals>
             <configuration>
               <exportAntProperties>true</exportAntProperties>
-              <tasks>
+              <target>
                 <property name="spark.classpath" refid="maven.test.classpath" />
                 <property environment="env" />
                 <fail message="Please set the SCALA_HOME (or SCALA_LIBRARY_PATH if scala is on the path) environment variables and retry.">
@@ -125,7 +125,7 @@
                   </not>
                 </condition>
               </fail>
-              </tasks>
+              </target>
             </configuration>
           </execution>
         </executions>
