
Commit f7fe143

Author: Andrew Or (committed)
Avoid unnecessary closure cleaning
1 parent 4fb52f9 commit f7fe143

1 file changed (+8, -3 lines)


sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala

Lines changed: 8 additions & 3 deletions
@@ -17,8 +17,8 @@
 
 package org.apache.spark.sql.sources
 
-import org.apache.spark.Logging
-import org.apache.spark.rdd.{RDD, UnionRDD}
+import org.apache.spark.{Logging, TaskContext}
+import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
 import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
@@ -184,7 +184,10 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         }
       }
 
-      dataRows.mapPartitions { iterator =>
+      // Since we know for sure that this closure is serializable, we can avoid the overhead
+      // of cleaning a closure for each RDD by creating our own MapPartitionsRDD. Functionally
+      // this is equivalent to calling `dataRows.mapPartitions(mapPartitionsFunc)` (SPARK-7718).
+      val mapPartitionsFunc = (_: TaskContext, _: Int, iterator: Iterator[Row]) => {
         val dataTypes = requiredColumns.map(schema(_).dataType)
         val mutableRow = new SpecificMutableRow(dataTypes)
         iterator.map { dataRow =>
@@ -196,6 +199,8 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
           mutableRow.asInstanceOf[expressions.Row]
         }
       }
+
+      new MapPartitionsRDD(dataRows, mapPartitionsFunc, preservesPartitioning = false)
     } else {
       dataRows
     }
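
Context on the change: RDD.mapPartitions passes its closure through the closure cleaner on every call, while constructing a MapPartitionsRDD directly skips that per-RDD step; MapPartitionsRDD takes a (TaskContext, partitionIndex, Iterator[T]) => Iterator[U] function, which is exactly what the diff builds. The following is a minimal sketch of the trade-off, not part of the commit: the object and package names are illustrative, and it assumes Spark 1.x-era APIs and code compiled under the org.apache.spark namespace, since MapPartitionsRDD is private[spark] (as DataSourceStrategy is).

// Illustrative sketch only; names and package are hypothetical.
package org.apache.spark.sketch

import org.apache.spark.TaskContext
import org.apache.spark.rdd.{MapPartitionsRDD, RDD}

object ClosureCleaningSketch {

  // Standard path: mapPartitions runs the closure cleaner on the passed
  // function each time, checking it for serializability before use.
  def viaMapPartitions(rows: RDD[Int]): RDD[Int] =
    rows.mapPartitions(iter => iter.map(_ + 1))

  // Path taken by this commit: the function is statically known to be
  // serializable, so the MapPartitionsRDD is constructed directly and the
  // per-RDD closure-cleaning overhead is avoided.
  def viaDirectConstruction(rows: RDD[Int]): RDD[Int] = {
    val f = (_: TaskContext, _: Int, iter: Iterator[Int]) => iter.map(_ + 1)
    new MapPartitionsRDD(rows, f, preservesPartitioning = false)
  }
}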
