Commit 4dfe604

infer schema
1 parent d96f103 commit 4dfe604

3 files changed: +40 -47 lines changed

python/pyspark/sql/dataframe.py

Lines changed: 11 additions & 5 deletions
@@ -1374,7 +1374,7 @@ def __init__(self, prev, func, output_schema=None):
         self._lazy_rdd = None

         if output_schema is not None:
-            # This transformation is applying schema, just copy member variables from prev.
+            # This transformation is adding schema, just copy member variables from prev.
             self.func = func
             self._prev_jdf = prev._prev_jdf
         elif not isinstance(prev, PipelinedDataFrame) or not prev.is_cached:
@@ -1385,16 +1385,22 @@ def __init__(self, prev, func, output_schema=None):
             self.func = _pipeline_func(prev.func, func)
             self._prev_jdf = prev._prev_jdf  # maintain the pipeline

-    def applySchema(self, schema):
+    def schema(self, schema):
         return PipelinedDataFrame(self, self.func, schema)

     @property
     def _jdf(self):
+        from pyspark.sql.types import _infer_type, _merge_type
+
         if self._jdf_val is None:
             if self.output_schema is None:
-                schema = StructType().add("binary", BinaryType(), False, {"pickled": True})
-                final_func = self.func
-            elif isinstance(self.output_schema, StructType):
+                # If no schema is specified, infer it from the whole data set.
+                jrdd = self._prev_jdf.javaToPython()
+                rdd = RDD(jrdd, self._sc, BatchedSerializer(PickleSerializer()))
+                func = self.func  # assign to a local variable to avoid referencing self in closure.
+                self.output_schema = rdd.mapPartitions(func).map(_infer_type).reduce(_merge_type)
+
+            if isinstance(self.output_schema, StructType):
                 schema = self.output_schema
                 to_row = lambda iterator: map(schema.toInternal, iterator)
                 final_func = _pipeline_func(self.func, to_row)
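Note on the new inference branch above: when no output schema is given, the transformed data set is scanned once, a type is inferred per record, and the per-record types are folded into a single schema. A minimal local sketch of that pattern follows, assuming a PySpark installation; _infer_type and _merge_type are the private helpers the diff imports, and the sample records are illustrative.

from functools import reduce
from pyspark.sql.types import _infer_type, _merge_type  # private helpers imported by the diff

# Locally mimic rdd.mapPartitions(func).map(_infer_type).reduce(_merge_type):
records = [3, 6, None]  # illustrative output of the user's map function
inferred = reduce(_merge_type, map(_infer_type, records))
print(inferred)  # expected LongType: None infers as NullType and is absorbed by the merge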

python/pyspark/sql/tests.py

Lines changed: 10 additions & 3 deletions
@@ -1159,24 +1159,31 @@ def test_dataset(self):
         func = lambda row: {"key": row.key + 1, "value": row.value}  # convert row to python dict
         ds2 = ds.mapPartitions2(lambda iterator: map(func, iterator))
         schema = StructType().add("key", IntegerType()).add("value", StringType())
-        ds3 = ds2.applySchema(schema)
+        ds3 = ds2.schema(schema)
         result = ds3.select("key").collect()
         self.assertEqual(result[0][0], 2)
         self.assertEqual(result[1][0], 3)

         schema = StructType().add("value", StringType())  # use a different but compatible schema
-        ds3 = ds2.applySchema(schema)
+        ds3 = ds2.schema(schema)
         result = ds3.collect()
         self.assertEqual(result[0][0], "1")
         self.assertEqual(result[1][0], "2")

         func = lambda row: row.key * 3
         ds2 = ds.mapPartitions2(lambda iterator: map(func, iterator))
-        ds3 = ds2.applySchema(IntegerType())  # use a flat schema
+        ds3 = ds2.schema(IntegerType())  # use a flat schema
         result = ds3.collect()
         self.assertEqual(result[0][0], 3)
         self.assertEqual(result[1][0], 6)

+        result = ds2.collect()  # schema can be inferred automatically
+        self.assertEqual(result[0][0], 3)
+        self.assertEqual(result[1][0], 6)
+
+        # row count should be correct even when no schema is specified.
+        self.assertEqual(ds2.count(), 2)
+

 class HiveContextSQLTests(ReusedPySparkTestCase):
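The test above exercises three ways of getting typed results out of a pipelined transformation: an explicit StructType, a flat non-struct type, and (new in this commit) no schema at all. A hedged end-to-end sketch of the same flow follows; mapPartitions2 and the renamed schema() method are the experimental API from this branch, and sqlContext plus the two-row input (key 1/2, value "1"/"2", as the assertions imply) are assumptions.

from pyspark.sql.types import StructType, IntegerType, StringType

ds = sqlContext.createDataFrame([(1, "1"), (2, "2")], ["key", "value"])  # assumed input

# 1. Explicit schema after a row -> dict transformation.
ds2 = ds.mapPartitions2(lambda it: ({"key": r.key + 1, "value": r.value} for r in it))
print(ds2.schema(StructType().add("key", IntegerType()).add("value", StringType())).collect())

# 2. Flat (non-struct) schema for a transformation that emits plain values.
print(ds.mapPartitions2(lambda it: (r.key * 3 for r in it)).schema(IntegerType()).collect())

# 3. No schema at all: with this commit, collect() and count() infer it from the data.
ds3 = ds.mapPartitions2(lambda it: (r.key * 3 for r in it))
print(ds3.collect(), ds3.count())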

sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala

Lines changed: 19 additions & 39 deletions
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection, GenerateUnsafeProjection, GenerateUnsafeRowJoiner}
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.execution.python.EvaluatePython
-import org.apache.spark.sql.types.{BinaryType, ObjectType, StructType}
+import org.apache.spark.sql.types.ObjectType

 /**
  * Helper functions for physical operators that work with user defined objects.
@@ -81,34 +81,22 @@ case class PythonMapPartitions(

   override def expressions: Seq[Expression] = Nil

-  private def isPickled(schema: StructType): Boolean = {
-    schema.length == 1 && schema.head.dataType == BinaryType &&
-      schema.head.metadata.contains("pickled")
-  }
-
   override protected def doExecute(): RDD[InternalRow] = {
     val inputRDD = child.execute().map(_.copy())
     val bufferSize = inputRDD.conf.getInt("spark.buffer.size", 65536)
     val reuseWorker = inputRDD.conf.getBoolean("spark.python.worker.reuse", defaultValue = true)
-    val childIsPickled = isPickled(child.schema)
-    val outputIsPickled = isPickled(schema)

     inputRDD.mapPartitions { iter =>
-      val inputIterator = if (childIsPickled) {
-        iter.map(_.getBinary(0))
-      } else {
-        EvaluatePython.registerPicklers()  // register pickler for Row
-
-        val pickle = new Pickler
-
-        // Input iterator to Python: input rows are grouped so we send them in batches to Python.
-        // For each row, add it to the queue.
-        iter.grouped(100).map { inputRows =>
-          val toBePickled = inputRows.map { row =>
-            EvaluatePython.toJava(row, child.schema)
-          }.toArray
-          pickle.dumps(toBePickled)
-        }
+      EvaluatePython.registerPicklers()  // register pickler for Row
+      val pickle = new Pickler
+
+      // Input iterator to Python: input rows are grouped so we send them in batches to Python.
+      // For each row, add it to the queue.
+      val inputIterator = iter.grouped(100).map { inputRows =>
+        val toBePickled = inputRows.map { row =>
+          EvaluatePython.toJava(row, child.schema)
+        }.toArray
+        pickle.dumps(toBePickled)
      }

       val context = TaskContext.get()
@@ -127,22 +115,14 @@ case class PythonMapPartitions(
         reuseWorker
       ).compute(inputIterator, context.partitionId(), context)

-      val resultProj = UnsafeProjection.create(output, output)
-
-      if (outputIsPickled) {
-        val row = new GenericMutableRow(1)
-        outputIterator.map { bytes =>
-          row(0) = bytes
-          resultProj(row)
-        }
-      } else {
-        val unpickle = new Unpickler
-        outputIterator.flatMap { pickedResult =>
-          val unpickledBatch = unpickle.loads(pickedResult)
-          unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala
-        }.map { result =>
-          resultProj(EvaluatePython.fromJava(result, schema).asInstanceOf[InternalRow])
-        }
+      val unpickle = new Unpickler
+      val toUnsafe = UnsafeProjection.create(output, output)
+
+      outputIterator.flatMap { pickedResult =>
+        val unpickledBatch = unpickle.loads(pickedResult)
+        unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala
+      }.map { result =>
+        toUnsafe(EvaluatePython.fromJava(result, schema).asInstanceOf[InternalRow])
       }
     }
   }
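With the pickled-binary fast path removed, every input partition now takes the same route: rows are converted with EvaluatePython.toJava, grouped into batches of 100, and each batch is pickled as one payload before being streamed to the Python worker. A rough Python illustration of that batching follows; the helper name and the plain pickle module are stand-ins for the Scala operator and net.razorvine's Pickler, not anything in this commit.

import pickle
from itertools import islice

def pickled_batches(rows, batch_size=100):
    """Yield one pickled payload per batch, like iter.grouped(100).map { pickle.dumps(...) }."""
    it = iter(rows)
    while True:
        batch = list(islice(it, batch_size))
        if not batch:
            return
        yield pickle.dumps(batch)

# 250 rows -> 3 payloads (100 + 100 + 50), each unpickling back to a list of rows.
payloads = list(pickled_batches(range(250)))
assert len(payloads) == 3 and len(pickle.loads(payloads[-1])) == 50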
