
Commit 3467cff

Merge remote-tracking branch 'upstream/master' into dimsumv2

Conflicts:
    mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala

2 parents: aea0247 + 0dc868e

File tree: 12 files changed (+121, -40 lines)

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ conf/*.cmd
 conf/*.properties
 conf/*.conf
 conf/*.xml
+conf/slaves
 docs/_site
 docs/api
 target/

.rat-excludes

Lines changed: 1 addition & 0 deletions

@@ -19,6 +19,7 @@ log4j.properties
 log4j.properties.template
 metrics.properties.template
 slaves
+slaves.template
 spark-env.sh
 spark-env.cmd
 spark-env.sh.template

File renamed without changes.

docs/mllib-clustering.md

Lines changed: 2 additions & 1 deletion

@@ -52,7 +52,7 @@ import org.apache.spark.mllib.linalg.Vectors
 
 // Load and parse the data
 val data = sc.textFile("data/mllib/kmeans_data.txt")
-val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
+val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()
 
 // Cluster the data into two classes using KMeans
 val numClusters = 2
@@ -100,6 +100,7 @@ public class KMeansExample {
     }
   }
 );
+parsedData.cache();
 
 // Cluster the data into two classes using KMeans
 int numClusters = 2;
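
This is the usual fix for an iterative algorithm: k-means makes a full pass over the input on every iteration, so an uncached `parsedData` would be re-read and re-parsed from the text file each time. A minimal sketch of the resulting Scala example (assuming a live SparkContext `sc`; the cluster and iteration counts are the guide's illustrative values):

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Load and parse the data, then cache it: KMeans rescans this RDD once per
// iteration, and caching avoids re-reading and re-parsing the file each time.
val data = sc.textFile("data/mllib/kmeans_data.txt")
val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

// Cluster the data into two classes using KMeans.
val numClusters = 2
val numIterations = 20
val clusters = KMeans.train(parsedData, numClusters, numIterations)
```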

docs/mllib-linear-methods.md

Lines changed: 5 additions & 4 deletions

@@ -396,7 +396,7 @@ val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
 val parsedData = data.map { line =>
   val parts = line.split(',')
   LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
-}
+}.cache()
 
 // Building the model
 val numIterations = 100
@@ -455,6 +455,7 @@ public class LinearRegression {
     }
   }
 );
+parsedData.cache();
 
 // Building the model
 int numIterations = 100;
@@ -470,7 +471,7 @@ public class LinearRegression {
     }
   }
 );
-JavaRDD<Object> MSE = new JavaDoubleRDD(valuesAndPreds.map(
+double MSE = new JavaDoubleRDD(valuesAndPreds.map(
   new Function<Tuple2<Double, Double>, Object>() {
     public Object call(Tuple2<Double, Double> pair) {
       return Math.pow(pair._1() - pair._2(), 2.0);
@@ -553,8 +554,8 @@ but in practice you will likely want to use unlabeled vectors for test data.
 
 {% highlight scala %}
 
-val trainingData = ssc.textFileStream('/training/data/dir').map(LabeledPoint.parse)
-val testData = ssc.textFileStream('/testing/data/dir').map(LabeledPoint.parse)
+val trainingData = ssc.textFileStream("/training/data/dir").map(LabeledPoint.parse).cache()
+val testData = ssc.textFileStream("/testing/data/dir").map(LabeledPoint.parse)
 
 {% endhighlight %}
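
These hunks make two distinct fixes: the training RDDs get cached (SGD samples a mini-batch from the input on every step), and the Java snippet's mean squared error becomes a plain `double` instead of a never-materialized `JavaRDD<Object>`; the streaming snippet additionally swaps invalid single-quoted literals for proper Scala strings. A hedged Scala sketch of the corrected train-then-evaluate pattern (assuming a SparkContext `sc`; the iteration count is the guide's value):

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}

// Cache the training data: SGD touches the input on every gradient step.
val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
val parsedData = data.map { line =>
  val parts = line.split(',')
  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
}.cache()

// Build the model, then score the training set.
val model = LinearRegressionWithSGD.train(parsedData, 100)
val valuesAndPreds = parsedData.map { p => (p.label, model.predict(p.features)) }

// Mean squared error as a single Double, mirroring the Java fix above.
val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2.0) }.mean()
```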

docs/mllib-optimization.md

Lines changed: 1 addition & 0 deletions

@@ -217,6 +217,7 @@ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.mllib.classification.LogisticRegressionModel
+import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater}
 
 val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 val numFeatures = data.take(1)(0).features.size
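
The added import matters because the surrounding example in that guide drives the low-level optimizer directly rather than going through a classifier wrapper. A hedged sketch of the call these three symbols support (parameter values are illustrative; `training` and `initialWeightsWithIntercept` are assumed to be prepared as in the guide):

```scala
import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater}

// Run L-BFGS with a logistic-loss gradient and an L2 regularization updater.
// Assumes training: RDD[(Double, Vector)] and initialWeightsWithIntercept: Vector.
val (weights, lossHistory) = LBFGS.runLBFGS(
  training,
  new LogisticGradient(),
  new SquaredL2Updater(),
  10,   // numCorrections
  1e-4, // convergenceTol
  20,   // maxNumIterations
  0.1,  // regParam
  initialWeightsWithIntercept)
```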

docs/spark-standalone.md

Lines changed: 6 additions & 1 deletion

@@ -62,7 +62,12 @@ Finally, the following configuration options can be passed to the master and worker:
 
 # Cluster Launch Scripts
 
-To launch a Spark standalone cluster with the launch scripts, you need to create a file called `conf/slaves` in your Spark directory, which should contain the hostnames of all the machines where you would like to start Spark workers, one per line. The master machine must be able to access each of the slave machines via password-less `ssh` (using a private key). For testing, you can just put `localhost` in this file.
+To launch a Spark standalone cluster with the launch scripts, you should create a file called conf/slaves in your Spark directory,
+which must contain the hostnames of all the machines where you intend to start Spark workers, one per line.
+If conf/slaves does not exist, the launch scripts default to a single machine (localhost), which is useful for testing.
+Note that the master machine accesses each of the worker machines via ssh. By default, ssh is run in parallel and requires password-less (using a private key) access to be set up.
+If you do not have a password-less setup, you can set the environment variable SPARK_SSH_FOREGROUND and serially provide a password for each worker.
+
 
 Once you've set up this file, you can launch or stop your cluster with the following shell scripts, based on Hadoop's deploy scripts, and available in `SPARK_HOME/bin`:
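
For reference, the file the launch scripts read is plain text with one worker hostname per line. A hypothetical conf/slaves (the hostnames are placeholders):

```
worker1.example.com
worker2.example.com
worker3.example.com
```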

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 29 additions & 25 deletions

@@ -67,11 +67,13 @@ class PythonMLLibAPI extends Serializable {
     MLUtils.loadLabeledPoints(jsc.sc, path, minPartitions)
 
   private def trainRegressionModel(
-      trainFunc: (RDD[LabeledPoint], Vector) => GeneralizedLinearModel,
+      learner: GeneralizedLinearAlgorithm[_ <: GeneralizedLinearModel],
       data: JavaRDD[LabeledPoint],
       initialWeightsBA: Array[Byte]): java.util.LinkedList[java.lang.Object] = {
     val initialWeights = SerDe.loads(initialWeightsBA).asInstanceOf[Vector]
-    val model = trainFunc(data.rdd, initialWeights)
+    // Disable the uncached input warning because 'data' is a deliberately uncached MappedRDD.
+    learner.disableUncachedWarning()
+    val model = learner.run(data.rdd, initialWeights)
     val ret = new java.util.LinkedList[java.lang.Object]()
     ret.add(SerDe.dumps(model.weights))
     ret.add(model.intercept: java.lang.Double)
@@ -106,8 +108,7 @@ class PythonMLLibAPI extends Serializable {
         + " Can only be initialized using the following string values: [l1, l2, none].")
     }
     trainRegressionModel(
-      (data, initialWeights) =>
-        lrAlg.run(data, initialWeights),
+      lrAlg,
       data,
       initialWeightsBA)
   }
@@ -122,15 +123,14 @@ class PythonMLLibAPI extends Serializable {
       regParam: Double,
       miniBatchFraction: Double,
       initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = {
+    val lassoAlg = new LassoWithSGD()
+    lassoAlg.optimizer
+      .setNumIterations(numIterations)
+      .setRegParam(regParam)
+      .setStepSize(stepSize)
+      .setMiniBatchFraction(miniBatchFraction)
     trainRegressionModel(
-      (data, initialWeights) =>
-        LassoWithSGD.train(
-          data,
-          numIterations,
-          stepSize,
-          regParam,
-          miniBatchFraction,
-          initialWeights),
+      lassoAlg,
       data,
       initialWeightsBA)
   }
@@ -145,15 +145,14 @@ class PythonMLLibAPI extends Serializable {
       regParam: Double,
       miniBatchFraction: Double,
       initialWeightsBA: Array[Byte]): java.util.List[java.lang.Object] = {
+    val ridgeAlg = new RidgeRegressionWithSGD()
+    ridgeAlg.optimizer
+      .setNumIterations(numIterations)
+      .setRegParam(regParam)
+      .setStepSize(stepSize)
+      .setMiniBatchFraction(miniBatchFraction)
     trainRegressionModel(
-      (data, initialWeights) =>
-        RidgeRegressionWithSGD.train(
-          data,
-          numIterations,
-          stepSize,
-          regParam,
-          miniBatchFraction,
-          initialWeights),
+      ridgeAlg,
       data,
       initialWeightsBA)
   }
@@ -186,8 +185,7 @@ class PythonMLLibAPI extends Serializable {
         + " Can only be initialized using the following string values: [l1, l2, none].")
     }
     trainRegressionModel(
-      (data, initialWeights) =>
-        SVMAlg.run(data, initialWeights),
+      SVMAlg,
       data,
       initialWeightsBA)
   }
@@ -220,8 +218,7 @@ class PythonMLLibAPI extends Serializable {
         + " Can only be initialized using the following string values: [l1, l2, none].")
     }
     trainRegressionModel(
-      (data, initialWeights) =>
-        LogRegAlg.run(data, initialWeights),
+      LogRegAlg,
       data,
       initialWeightsBA)
   }
@@ -249,7 +246,14 @@ class PythonMLLibAPI extends Serializable {
       maxIterations: Int,
       runs: Int,
       initializationMode: String): KMeansModel = {
-    KMeans.train(data.rdd, k, maxIterations, runs, initializationMode)
+    val kMeansAlg = new KMeans()
+      .setK(k)
+      .setMaxIterations(maxIterations)
+      .setRuns(runs)
+      .setInitializationMode(initializationMode)
+      // Disable the uncached input warning because 'data' is a deliberately uncached MappedRDD.
+      .disableUncachedWarning()
+    return kMeansAlg.run(data.rdd)
   }
 
   /**
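
The Lasso and ridge hunks replace the static `train` helpers with explicitly configured learner objects; the two forms are equivalent, and the object form is what lets the shared helper call the new `private[spark]` `disableUncachedWarning()` hook before running. A sketch of the equivalence, using names from the diff:

```scala
// The static convenience call
//   LassoWithSGD.train(data, numIterations, stepSize, regParam, miniBatchFraction, initialWeights)
// does the same work as configuring the learner's optimizer explicitly:
val lassoAlg = new LassoWithSGD()
lassoAlg.optimizer
  .setNumIterations(numIterations)
  .setRegParam(regParam)
  .setStepSize(stepSize)
  .setMiniBatchFraction(miniBatchFraction)
val model = lassoAlg.run(data, initialWeights)
```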

mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala

Lines changed: 22 additions & 0 deletions

@@ -27,6 +27,7 @@ import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.random.XORShiftRandom
 
 /**
@@ -112,11 +113,26 @@ class KMeans private (
     this
   }
 
+  /** Whether a warning should be logged if the input RDD is uncached. */
+  private var warnOnUncachedInput = true
+
+  /** Disable warnings about uncached input. */
+  private[spark] def disableUncachedWarning(): this.type = {
+    warnOnUncachedInput = false
+    this
+  }
+
   /**
    * Train a K-means model on the given set of points; `data` should be cached for high
    * performance, because this is an iterative algorithm.
    */
   def run(data: RDD[Vector]): KMeansModel = {
+
+    if (warnOnUncachedInput && data.getStorageLevel == StorageLevel.NONE) {
+      logWarning("The input data is not directly cached, which may hurt performance if its"
+        + " parent RDDs are also uncached.")
+    }
+
     // Compute squared norms and cache them.
     val norms = data.map(v => breezeNorm(v.toBreeze, 2.0))
     norms.persist()
@@ -125,6 +141,12 @@ class KMeans private (
     }
     val model = runBreeze(breezeData)
     norms.unpersist()
+
+    // Warn at the end of the run as well, for increased visibility.
+    if (warnOnUncachedInput && data.getStorageLevel == StorageLevel.NONE) {
+      logWarning("The input data was not directly cached, which may hurt performance if its"
+        + " parent RDDs are also uncached.")
+    }
     model
   }
 
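
From the caller's side, the new check only inspects the storage level of the RDD passed to `run()`, so persisting the input before training silences it. A small sketch of both paths, assuming a live SparkContext `sc`:

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.storage.StorageLevel

val points = sc.textFile("data/mllib/kmeans_data.txt")
  .map(s => Vectors.dense(s.split(' ').map(_.toDouble)))

// points.getStorageLevel == StorageLevel.NONE here, so run() logs the
// uncached-input warning at the start and again at the end of training.
val warned = new KMeans().setK(2).setMaxIterations(20).run(points)

// Persisting first passes the check and avoids re-parsing every iteration.
points.persist(StorageLevel.MEMORY_ONLY)
val quiet = new KMeans().setK(2).setMaxIterations(20).run(points)
```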

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala

Lines changed: 11 additions & 0 deletions

@@ -34,6 +34,7 @@ import org.apache.spark.mllib.rdd.RDDFunctions._
 import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, MultivariateStatisticalSummary}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.random.XORShiftRandom
+import org.apache.spark.storage.StorageLevel
 
 /**
  * :: Experimental ::
@@ -235,6 +236,10 @@ class RowMatrix(
         val brzSvd.SVD(uFull: BDM[Double], sigmaSquaresFull: BDV[Double], _) = brzSvd(G)
         (sigmaSquaresFull, uFull)
       case SVDMode.DistARPACK =>
+        if (rows.getStorageLevel == StorageLevel.NONE) {
+          logWarning("The input data is not directly cached, which may hurt performance if its"
+            + " parent RDDs are also uncached.")
+        }
         require(k < n, s"k must be smaller than n in dist-eigs mode but got k=$k and n=$n.")
         EigenValueDecomposition.symmetricEigs(multiplyGramianMatrixBy, n, k, tol, maxIter)
     }
@@ -260,6 +265,12 @@ class RowMatrix(
       logWarning(s"Requested $k singular values but only found $sk nonzeros.")
     }
 
+    // Warn at the end of the run as well, for increased visibility.
+    if (computeMode == SVDMode.DistARPACK && rows.getStorageLevel == StorageLevel.NONE) {
+      logWarning("The input data was not directly cached, which may hurt performance if its"
+        + " parent RDDs are also uncached.")
+    }
+
     val s = Vectors.dense(Arrays.copyOfRange(sigmas.data, 0, sk))
     val V = Matrices.dense(n, sk, Arrays.copyOfRange(u.data, 0, n * sk))
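
The same begin-and-end warning pattern lands in `computeSVD`, but only for the DistARPACK path, where the Gramian multiply (`multiplyGramianMatrixBy`) traverses `rows` once per ARPACK iteration; the local modes read the data a bounded number of times. A sketch of the path that can trigger it (`rows` is an assumed, uncached RDD[Vector] large enough that the distributed mode is selected):

```scala
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

// With an uncached input in dist-eigs mode, each implicit Gramian
// multiplication recomputes 'rows', so the new warning fires twice.
def svdOnUncachedRows(rows: RDD[Vector]): Unit = {
  val mat = new RowMatrix(rows)
  val svd = mat.computeSVD(20, computeU = true)
  println(svd.s) // singular values
}
```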
265276
