Skip to content

Commit 676f1ac

Browse files
committed
Add configurable maximum number of pivot values when none are given to prevent unintended OOM errors.
1 parent 12a8270 commit 676f1ac

File tree

3 files changed

+28
-2
lines changed

3 files changed

+28
-2
lines changed

sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,13 +309,23 @@ class GroupedData protected[sql](
309309
s"The values of a pivot must be literals, found $other")
310310
}
311311
} else {
312+
// This is to prevent unintended OOM errors when the number of distinct values is large
313+
val maxValues = df.sqlContext.conf.getConf(SQLConf.DATAFRAME_PIVOT_MAX_VALUES)
312314
// Get the distinct values of the column and sort them so it's consistent
313-
df.select(pivotColumn)
315+
val values = df.select(pivotColumn)
314316
.distinct()
315317
.sort(pivotColumn)
316318
.map(_.get(0))
317-
.collect()
319+
.take(maxValues + 1)
318320
.map(Literal(_)).toSeq
321+
if (values.length > maxValues) {
322+
throw new RuntimeException(
323+
s"The pivot column $pivotColumn has more than $maxValues distinct values, " +
324+
"this could indicate an error. " +
325+
"If this was intended, set \"" + SQLConf.DATAFRAME_PIVOT_MAX_VALUES.key + "\" " +
326+
s"to at least the number of distinct values of the pivot column.")
327+
}
328+
values
319329
}
320330
new GroupedData(df, groupingExprs, GroupedData.PivotType(pivotColumn.expr, pivotValues))
321331
case _ =>

sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,13 @@ private[spark] object SQLConf {
453453
defaultValue = Some(true),
454454
isPublic = false)
455455

456+
val DATAFRAME_PIVOT_MAX_VALUES = intConf(
457+
"spark.sql.pivotMaxValues",
458+
defaultValue = Some(10000),
459+
doc = "When doing a pivot without specifying values for the pivot column, this is the maximum " +
460+
"number of (distinct) values that will be collected without error."
461+
)
462+
456463
val RUN_SQL_ON_FILES = booleanConf("spark.sql.runSQLOnFiles",
457464
defaultValue = Some(true),
458465
isPublic = false,

sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,13 @@ class DataFramePivotSuite extends QueryTest with SharedSQLContext{
7575
Row("dotNET", 15000.0, 48000.0) :: Row("Java", 20000.0, 30000.0) :: Nil
7676
)
7777
}
78+
79+
test("pivot max values enforced") {
80+
sqlContext.conf.setConf(SQLConf.DATAFRAME_PIVOT_MAX_VALUES, 1)
81+
intercept[RuntimeException](
82+
courseSales.groupBy($"year").pivot($"course")
83+
)
84+
sqlContext.conf.setConf(SQLConf.DATAFRAME_PIVOT_MAX_VALUES,
85+
SQLConf.DATAFRAME_PIVOT_MAX_VALUES.defaultValue.get)
86+
}
7887
}

0 commit comments

Comments
 (0)