Commit 61e5a87

Author: liguoqiang
Commit message: Merge branch 'master' into SPARK-1149
2 parents: e68210a + 3a8b698

13 files changed: 206 additions & 88 deletions

core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala

Lines changed: 10 additions & 8 deletions
@@ -25,6 +25,8 @@ import org.apache.hadoop.security.UserGroupInformation
 
 import org.apache.spark.{SparkContext, SparkException}
 
+import scala.collection.JavaConversions._
+
 /**
  * Contains util methods to interact with Hadoop from Spark.
  */
@@ -33,15 +35,9 @@ class SparkHadoopUtil {
   UserGroupInformation.setConfiguration(conf)
 
   def runAsUser(user: String)(func: () => Unit) {
-    // if we are already running as the user intended there is no reason to do the doAs. It
-    // will actually break secure HDFS access as it doesn't fill in the credentials. Also if
-    // the user is UNKNOWN then we shouldn't be creating a remote unknown user
-    // (this is actually the path spark on yarn takes) since SPARK_USER is initialized only
-    // in SparkContext.
-    val currentUser = Option(System.getProperty("user.name")).
-      getOrElse(SparkContext.SPARK_UNKNOWN_USER)
-    if (user != SparkContext.SPARK_UNKNOWN_USER && currentUser != user) {
+    if (user != SparkContext.SPARK_UNKNOWN_USER) {
       val ugi = UserGroupInformation.createRemoteUser(user)
+      transferCredentials(UserGroupInformation.getCurrentUser(), ugi)
       ugi.doAs(new PrivilegedExceptionAction[Unit] {
        def run: Unit = func()
       })
@@ -50,6 +46,12 @@ class SparkHadoopUtil {
     }
   }
 
+  def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) {
+    for (token <- source.getTokens()) {
+      dest.addToken(token)
+    }
+  }
+
   /**
   * Return an appropriate (subclass) of Configuration. Creating config can initializes some Hadoop
   * subsystems.
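For context on the hunk above: the new transferCredentials call matters because UserGroupInformation.createRemoteUser returns a UGI with an empty credential set, so work run inside its doAs block would lose access to secure HDFS unless the caller's delegation tokens are copied over first. A minimal standalone sketch of the same pattern (RunAsExample and runAs are illustrative names, not Spark API):

import java.security.PrivilegedExceptionAction

import org.apache.hadoop.security.UserGroupInformation

import scala.collection.JavaConversions._

// Illustrative helper: run a block as another user while carrying the
// caller's Hadoop delegation tokens across the doAs boundary.
object RunAsExample {
  def runAs(user: String)(body: => Unit) {
    val ugi = UserGroupInformation.createRemoteUser(user)
    // createRemoteUser starts with no tokens; copy them from the current UGI,
    // otherwise secure HDFS calls inside doAs would fail to authenticate.
    for (token <- UserGroupInformation.getCurrentUser().getTokens()) {
      ugi.addToken(token)
    }
    ugi.doAs(new PrivilegedExceptionAction[Unit] {
      def run(): Unit = body
    })
  }
}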

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 17 additions & 10 deletions
@@ -30,18 +30,15 @@ import scala.reflect.ClassTag
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLog
 import org.apache.hadoop.conf.{Configurable, Configuration}
-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.CompressionCodec
 import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat}
-import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat}
-import org.apache.hadoop.mapreduce.{Job => NewAPIHadoopJob}
-import org.apache.hadoop.mapreduce.{RecordWriter => NewRecordWriter}
+import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat, Job => NewAPIHadoopJob, RecordWriter => NewRecordWriter, JobContext, SparkHadoopMapReduceUtil}
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat => NewFileOutputFormat}
 
 // SparkHadoopWriter and SparkHadoopMapReduceUtil are actually source files defined in Spark.
 import org.apache.hadoop.mapred.SparkHadoopWriter
-import org.apache.hadoop.mapreduce.SparkHadoopMapReduceUtil
 
 import org.apache.spark._
 import org.apache.spark.Partitioner.defaultPartitioner
@@ -604,8 +601,12 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
     val job = new NewAPIHadoopJob(conf)
     job.setOutputKeyClass(keyClass)
     job.setOutputValueClass(valueClass)
+
     val wrappedConf = new SerializableWritable(job.getConfiguration)
-    NewFileOutputFormat.setOutputPath(job, new Path(path))
+    val outpath = new Path(path)
+    NewFileOutputFormat.setOutputPath(job, outpath)
+    val jobFormat = outputFormatClass.newInstance
+    jobFormat.checkOutputSpecs(job)
     val formatter = new SimpleDateFormat("yyyyMMddHHmm")
     val jobtrackerID = formatter.format(new Date())
     val stageId = self.id
@@ -633,7 +634,7 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
       committer.commitTask(hadoopContext)
       return 1
     }
-    val jobFormat = outputFormatClass.newInstance
+
     /* apparently we need a TaskAttemptID to construct an OutputCommitter;
      * however we're only going to use this local OutputCommitter for
      * setupJob/commitJob, so we just use a dummy "map" task.
@@ -642,7 +643,7 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
     val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId)
     val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext)
     jobCommitter.setupJob(jobTaskContext)
-    val count = self.context.runJob(self, writeShard _).sum
+    self.context.runJob(self, writeShard _)
    jobCommitter.commitJob(jobTaskContext)
   }
 
@@ -696,10 +697,10 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
    * MapReduce job.
    */
  def saveAsHadoopDataset(conf: JobConf) {
-    val outputFormatClass = conf.getOutputFormat
+    val outputFormatInstance = conf.getOutputFormat
     val keyClass = conf.getOutputKeyClass
     val valueClass = conf.getOutputValueClass
-    if (outputFormatClass == null) {
+    if (outputFormatInstance == null) {
       throw new SparkException("Output format class not set")
     }
     if (keyClass == null) {
@@ -712,6 +713,12 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
     logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " +
       valueClass.getSimpleName + ")")
 
+    if (outputFormatInstance.isInstanceOf[FileOutputFormat[_, _]]) {
+      // FileOutputFormat ignores the filesystem parameter
+      val ignoredFs = FileSystem.get(conf)
+      conf.getOutputFormat.checkOutputSpecs(ignoredFs, conf)
+    }
+
     val writer = new SparkHadoopWriter(conf)
     writer.preSetup()
 
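The checkOutputSpecs calls added above change the user-visible behaviour of the save methods: writing to a path that already exists now fails at job-submission time with FileAlreadyExistsException (for FileOutputFormat-based formats) instead of proceeding into the tasks. A minimal sketch of that effect, assuming a local master and a hypothetical scratch path:

import org.apache.hadoop.mapred.FileAlreadyExistsException

import org.apache.spark.SparkContext

object OverwriteCheckSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "overwrite-check-sketch")
    val pairs = sc.parallelize(Seq((1, "a"), (2, "b")))
    val out = "/tmp/overwrite-check-sketch"  // hypothetical output path
    pairs.saveAsTextFile(out)                // first save creates the directory
    try {
      // A second save to the same path is now rejected by checkOutputSpecs
      // before any tasks run.
      pairs.saveAsTextFile(out)
    } catch {
      case e: FileAlreadyExistsException => println("rejected: " + e.getMessage)
    }
    sc.stop()
  }
}
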
core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 9 additions & 6 deletions
@@ -25,6 +25,7 @@ import scala.concurrent.duration._
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable.HashMap
 import scala.collection.mutable.HashSet
+import scala.util.Random
 
 import org.apache.spark._
 import org.apache.spark.TaskState.TaskState
@@ -207,9 +208,11 @@ private[spark] class TaskSchedulerImpl(
       }
     }
 
-    // Build a list of tasks to assign to each worker
-    val tasks = offers.map(o => new ArrayBuffer[TaskDescription](o.cores))
-    val availableCpus = offers.map(o => o.cores).toArray
+    // Randomly shuffle offers to avoid always placing tasks on the same set of workers.
+    val shuffledOffers = Random.shuffle(offers)
+    // Build a list of tasks to assign to each worker.
+    val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
+    val availableCpus = shuffledOffers.map(o => o.cores).toArray
     val sortedTaskSets = rootPool.getSortedTaskSetQueue()
     for (taskSet <- sortedTaskSets) {
       logDebug("parentName: %s, name: %s, runningTasks: %s".format(
@@ -222,9 +225,9 @@ private[spark] class TaskSchedulerImpl(
     for (taskSet <- sortedTaskSets; maxLocality <- TaskLocality.values) {
       do {
         launchedTask = false
-        for (i <- 0 until offers.size) {
-          val execId = offers(i).executorId
-          val host = offers(i).host
+        for (i <- 0 until shuffledOffers.size) {
+          val execId = shuffledOffers(i).executorId
+          val host = shuffledOffers(i).host
           for (task <- taskSet.resourceOffer(execId, host, availableCpus(i), maxLocality)) {
             tasks(i) += task
             val tid = task.taskId
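The change above relies only on a standard-library property: scala.util.Random.shuffle returns the same elements in a random order, so every offer is still considered but the scan no longer always starts from the first worker. A small sketch using a stand-in case class (Offer here is illustrative; the scheduler itself uses WorkerOffer):

import scala.util.Random

object ShuffleOffersSketch {
  // Stand-in for the scheduler's WorkerOffer(executorId, host, cores).
  case class Offer(executorId: String, host: String, cores: Int)

  def main(args: Array[String]) {
    val offers = Seq(Offer("executor0", "host0", 1), Offer("executor1", "host1", 1))
    // Same elements, order varies between calls, so repeated single-task
    // scheduling rounds get spread across both executors over time.
    val shuffled = Random.shuffle(offers)
    assert(shuffled.toSet == offers.toSet)
    println(shuffled.map(_.executorId).mkString(", "))
  }
}
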

core/src/test/scala/org/apache/spark/FileSuite.scala

Lines changed: 42 additions & 0 deletions
@@ -24,9 +24,11 @@ import scala.io.Source
 import com.google.common.io.Files
 import org.apache.hadoop.io._
 import org.apache.hadoop.io.compress.DefaultCodec
+import org.apache.hadoop.mapred.FileAlreadyExistsException
 import org.scalatest.FunSuite
 
 import org.apache.spark.SparkContext._
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
 
 class FileSuite extends FunSuite with LocalSparkContext {
 
@@ -208,4 +210,44 @@ class FileSuite extends FunSuite with LocalSparkContext {
     assert(rdd.count() === 3)
     assert(rdd.count() === 3)
   }
+
+  test ("prevent user from overwriting the empty directory (old Hadoop API)") {
+    sc = new SparkContext("local", "test")
+    val tempdir = Files.createTempDir()
+    val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1)
+    intercept[FileAlreadyExistsException] {
+      randomRDD.saveAsTextFile(tempdir.getPath)
+    }
+  }
+
+  test ("prevent user from overwriting the non-empty directory (old Hadoop API)") {
+    sc = new SparkContext("local", "test")
+    val tempdir = Files.createTempDir()
+    val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1)
+    randomRDD.saveAsTextFile(tempdir.getPath + "/output")
+    assert(new File(tempdir.getPath + "/output/part-00000").exists() === true)
+    intercept[FileAlreadyExistsException] {
+      randomRDD.saveAsTextFile(tempdir.getPath + "/output")
+    }
+  }
+
+  test ("prevent user from overwriting the empty directory (new Hadoop API)") {
+    sc = new SparkContext("local", "test")
+    val tempdir = Files.createTempDir()
+    val randomRDD = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
+    intercept[FileAlreadyExistsException] {
+      randomRDD.saveAsNewAPIHadoopFile[TextOutputFormat[String, String]](tempdir.getPath)
+    }
+  }
+
+  test ("prevent user from overwriting the non-empty directory (new Hadoop API)") {
+    sc = new SparkContext("local", "test")
+    val tempdir = Files.createTempDir()
+    val randomRDD = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
+    randomRDD.saveAsTextFile(tempdir.getPath + "/output")
+    assert(new File(tempdir.getPath + "/output/part-00000").exists() === true)
+    intercept[FileAlreadyExistsException] {
+      randomRDD.saveAsNewAPIHadoopFile[TextOutputFormat[String, String]](tempdir.getPath)
+    }
+  }
 }

core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala

Lines changed: 16 additions & 0 deletions
@@ -24,3 +24,19 @@ class FakeTask(stageId: Int, prefLocs: Seq[TaskLocation] = Nil) extends Task[Int
 
   override def preferredLocations: Seq[TaskLocation] = prefLocs
 }
+
+object FakeTask {
+  /**
+   * Utility method to create a TaskSet, potentially setting a particular sequence of preferred
+   * locations for each task (given as varargs) if this sequence is not empty.
+   */
+  def createTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = {
+    if (prefLocs.size != 0 && prefLocs.size != numTasks) {
+      throw new IllegalArgumentException("Wrong number of task locations")
+    }
+    val tasks = Array.tabulate[Task[_]](numTasks) { i =>
+      new FakeTask(i, if (prefLocs.size != 0) prefLocs(i) else Nil)
+    }
+    new TaskSet(tasks, 0, 0, 0, null)
+  }
+}
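A hedged usage sketch of the new helper, meant to run inside a ScalaTest FunSuite body as in TaskSchedulerImplSuite (intercept and === come from ScalaTest, not from this commit):

// One task, no locality preference -- the shape used by the scheduler tests.
val taskSet = FakeTask.createTaskSet(1)
assert(taskSet.tasks.length === 1)

// Passing a number of preferred-location lists that does not match numTasks
// is rejected up front (here: one list for two tasks).
intercept[IllegalArgumentException] {
  FakeTask.createTaskSet(2, Nil)
}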

core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala

Lines changed: 43 additions & 13 deletions
@@ -25,6 +25,13 @@ import org.scalatest.FunSuite
 
 import org.apache.spark._
 
+class FakeSchedulerBackend extends SchedulerBackend {
+  def start() {}
+  def stop() {}
+  def reviveOffers() {}
+  def defaultParallelism() = 1
+}
+
 class FakeTaskSetManager(
     initPriority: Int,
     initStageId: Int,
@@ -107,7 +114,8 @@ class FakeTaskSetManager(
 
 class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Logging {
 
-  def createDummyTaskSetManager(priority: Int, stage: Int, numTasks: Int, cs: TaskSchedulerImpl, taskSet: TaskSet): FakeTaskSetManager = {
+  def createDummyTaskSetManager(priority: Int, stage: Int, numTasks: Int, cs: TaskSchedulerImpl,
+      taskSet: TaskSet): FakeTaskSetManager = {
     new FakeTaskSetManager(priority, stage, numTasks, cs , taskSet)
   }
 
@@ -135,10 +143,7 @@ class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Loggin
   test("FIFO Scheduler Test") {
     sc = new SparkContext("local", "TaskSchedulerImplSuite")
     val taskScheduler = new TaskSchedulerImpl(sc)
-    var tasks = ArrayBuffer[Task[_]]()
-    val task = new FakeTask(0)
-    tasks += task
-    val taskSet = new TaskSet(tasks.toArray,0,0,0,null)
+    val taskSet = FakeTask.createTaskSet(1)
 
     val rootPool = new Pool("", SchedulingMode.FIFO, 0, 0)
     val schedulableBuilder = new FIFOSchedulableBuilder(rootPool)
@@ -162,10 +167,7 @@ class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Loggin
   test("Fair Scheduler Test") {
     sc = new SparkContext("local", "TaskSchedulerImplSuite")
     val taskScheduler = new TaskSchedulerImpl(sc)
-    var tasks = ArrayBuffer[Task[_]]()
-    val task = new FakeTask(0)
-    tasks += task
-    val taskSet = new TaskSet(tasks.toArray,0,0,0,null)
+    val taskSet = FakeTask.createTaskSet(1)
 
     val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile()
     System.setProperty("spark.scheduler.allocation.file", xmlPath)
@@ -219,10 +221,7 @@ class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Loggin
   test("Nested Pool Test") {
     sc = new SparkContext("local", "TaskSchedulerImplSuite")
     val taskScheduler = new TaskSchedulerImpl(sc)
-    var tasks = ArrayBuffer[Task[_]]()
-    val task = new FakeTask(0)
-    tasks += task
-    val taskSet = new TaskSet(tasks.toArray,0,0,0,null)
+    val taskSet = FakeTask.createTaskSet(1)
 
     val rootPool = new Pool("", SchedulingMode.FAIR, 0, 0)
     val pool0 = new Pool("0", SchedulingMode.FAIR, 3, 1)
@@ -265,4 +264,35 @@ class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Loggin
     checkTaskSetId(rootPool, 6)
     checkTaskSetId(rootPool, 2)
   }
+
+  test("Scheduler does not always schedule tasks on the same workers") {
+    sc = new SparkContext("local", "TaskSchedulerImplSuite")
+    val taskScheduler = new TaskSchedulerImpl(sc)
+    taskScheduler.initialize(new FakeSchedulerBackend)
+    // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks.
+    var dagScheduler = new DAGScheduler(taskScheduler) {
+      override def taskStarted(task: Task[_], taskInfo: TaskInfo) {}
+      override def executorGained(execId: String, host: String) {}
+    }
+
+    val numFreeCores = 1
+    val workerOffers = Seq(new WorkerOffer("executor0", "host0", numFreeCores),
+      new WorkerOffer("executor1", "host1", numFreeCores))
+    // Repeatedly try to schedule a 1-task job, and make sure that it doesn't always
+    // get scheduled on the same executor. While there is a chance this test will fail
+    // because the task randomly gets placed on the first executor all 1000 times, the
+    // probability of that happening is 2^-1000 (so sufficiently small to be considered
+    // negligible).
+    val numTrials = 1000
+    val selectedExecutorIds = 1.to(numTrials).map { _ =>
+      val taskSet = FakeTask.createTaskSet(1)
+      taskScheduler.submitTasks(taskSet)
+      val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten
+      assert(1 === taskDescriptions.length)
+      taskDescriptions(0).executorId
+    }
+    var count = selectedExecutorIds.count(_ == workerOffers(0).executorId)
+    assert(count > 0)
+    assert(count < numTrials)
+  }
 }
