Commit 4ea23db

SPARK-1019: pyspark RDD take() throws an NPE
Author: Patrick Wendell <[email protected]>

Closes #112 from pwendell/pyspark-take and squashes the following commits:

daae80e [Patrick Wendell] SPARK-1019: pyspark RDD take() throws an NPE
1 parent 6bd2eaa commit 4ea23db

File tree

2 files changed: 10 additions, 1 deletion


core/src/main/scala/org/apache/spark/TaskContext.scala

Lines changed: 2 additions & 1 deletion

@@ -46,6 +46,7 @@ class TaskContext(
   }
 
   def executeOnCompleteCallbacks() {
-    onCompleteCallbacks.foreach{_()}
+    // Process complete callbacks in the reverse order of registration
+    onCompleteCallbacks.reverse.foreach{_()}
   }
 }
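The reversal makes completion run in LIFO order: a callback registered later (which may depend on resources set up by earlier callbacks) now fires first. A minimal sketch of that ordering, using a hypothetical `CallbackRegistry` stand-in rather than Spark's actual `TaskContext`:

```scala
import scala.collection.mutable.ArrayBuffer

// Hypothetical stand-in for TaskContext's callback list (not Spark code).
class CallbackRegistry {
  private val onCompleteCallbacks = new ArrayBuffer[() => Unit]

  def addOnCompleteCallback(f: () => Unit): Unit = onCompleteCallbacks += f

  // As in the patch: run callbacks in reverse order of registration,
  // so the most recently registered callback fires first.
  def executeOnCompleteCallbacks(): Unit =
    onCompleteCallbacks.reverse.foreach(_())
}

val order = new ArrayBuffer[String]
val ctx = new CallbackRegistry
ctx.addOnCompleteCallback(() => order += "registered first")
ctx.addOnCompleteCallback(() => order += "registered second")
ctx.executeOnCompleteCallbacks()
// order now lists "registered second" before "registered first"
```

LIFO teardown mirrors how destructors and `defer`-style mechanisms unwind: the last thing set up is the first thing torn down.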

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 8 additions & 0 deletions

@@ -100,6 +100,14 @@ private[spark] class PythonRDD[T: ClassTag](
       }
     }.start()
 
+    /*
+     * Partial fix for SPARK-1019: Attempts to stop reading the input stream since
+     * other completion callbacks might invalidate the input. Because interruption
+     * is not synchronous this still leaves a potential race where the interruption is
+     * processed only after the stream becomes invalid.
+     */
+    context.addOnCompleteCallback(() => context.interrupted = true)
+
     // Return an iterator that read lines from the process's stdout
     val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize))
     val stdoutIterator = new Iterator[Array[Byte]] {
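The flag-based shutdown can be sketched as follows. `StreamReader` is a hypothetical illustration, not Spark code: a `@volatile` `interrupted` flag, flipped by a completion callback, tells the reader loop to stop touching the stream. Because the flag is only checked between reads, an in-flight read can still race with the stream becoming invalid, which is the residual window the commit comment describes.

```scala
// Hypothetical sketch (not Spark's PythonRDD) of interruption via a
// volatile flag checked by a reader loop.
class StreamReader(lines: Iterator[String]) {
  // @volatile ensures the reader thread sees a flip made by another thread.
  @volatile var interrupted = false

  def readAll(): List[String] = {
    val out = scala.collection.mutable.ListBuffer[String]()
    // The flag is checked only between reads: a read already in progress
    // when the flag flips is not interrupted.
    while (!interrupted && lines.hasNext) out += lines.next()
    out.toList
  }
}

val reader = new StreamReader(Iterator("a", "b", "c"))
reader.interrupted = true // simulate the completion callback firing first
val remaining = reader.readAll() // loop exits before reading anything
```

This is why the commit calls itself a "partial fix": setting a flag requests a stop but cannot cancel a blocked or in-flight read synchronously.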
