[SPARK-20409][SQL] fail early if aggregate function in GROUP BY

cloud-fan · hvanhovell · commit b91873db0930 · 2017-04-20T16:59:38.000+02:00
## What changes were proposed in this pull request? It's illegal to have aggregate function in GROUP BY, and we should fail at analysis phase, if this happens. ## How was this patch tested? new regression test Author: Wenchen Fan <wenchen@databricks.com> Closes #17704 from cloud-fan/minor.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -966,7 +966,7 @@ class Analyzer(
       case p if !p.childrenResolved => p
       // Replace the index with the related attribute for ORDER BY,
       // which is a 1-base position of the projection list.
-      case s @ Sort(orders, global, child)
+      case Sort(orders, global, child)
         if orders.exists(_.child.isInstanceOf[UnresolvedOrdinal]) =>
         val newOrders = orders map {
           case s @ SortOrder(UnresolvedOrdinal(index), direction, nullOrdering, _) =>
@@ -983,17 +983,11 @@ class Analyzer(
 
       // Replace the index with the corresponding expression in aggregateExpressions. The index is
       // a 1-base position of aggregateExpressions, which is output columns (select expression)
-      case a @ Aggregate(groups, aggs, child) if aggs.forall(_.resolved) &&
+      case Aggregate(groups, aggs, child) if aggs.forall(_.resolved) &&
         groups.exists(_.isInstanceOf[UnresolvedOrdinal]) =>
         val newGroups = groups.map {
-          case ordinal @ UnresolvedOrdinal(index) if index > 0 && index <= aggs.size =>
-            aggs(index - 1) match {
-              case e if ResolveAggregateFunctions.containsAggregate(e) =>
-                ordinal.failAnalysis(
-                  s"GROUP BY position $index is an aggregate function, and " +
-                    "aggregate functions are not allowed in GROUP BY")
-              case o => o
-            }
+          case u @ UnresolvedOrdinal(index) if index > 0 && index <= aggs.size =>
+            aggs(index - 1)
           case ordinal @ UnresolvedOrdinal(index) =>
             ordinal.failAnalysis(
               s"GROUP BY position $index is not in select list " +
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -254,6 +254,11 @@ trait CheckAnalysis extends PredicateHelper {
             }
 
             def checkValidGroupingExprs(expr: Expression): Unit = {
+              if (expr.find(_.isInstanceOf[AggregateExpression]).isDefined) {
+                failAnalysis(
+                  "aggregate functions are not allowed in GROUP BY, but found " + expr.sql)
+              }
+
               // Check if the data type of expr is orderable.
               if (!RowOrdering.isOrderable(expr.dataType)) {
                 failAnalysis(
@@ -271,8 +276,8 @@ trait CheckAnalysis extends PredicateHelper {
               }
             }
 
-            aggregateExprs.foreach(checkValidAggregateExpression)
             groupingExprs.foreach(checkValidGroupingExprs)
+            aggregateExprs.foreach(checkValidAggregateExpression)
 
           case Sort(orders, _, _) =>
             orders.foreach { order =>
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out
@@ -122,7 +122,7 @@ select a, b, sum(b) from data group by 3
 struct<>
 -- !query 11 output
 org.apache.spark.sql.AnalysisException
-GROUP BY position 3 is an aggregate function, and aggregate functions are not allowed in GROUP BY; line 1 pos 39
+aggregate functions are not allowed in GROUP BY, but found sum(CAST(data.`b` AS BIGINT));
 
 
 -- !query 12
@@ -131,7 +131,7 @@ select a, b, sum(b) + 2 from data group by 3
 struct<>
 -- !query 12 output
 org.apache.spark.sql.AnalysisException
-GROUP BY position 3 is an aggregate function, and aggregate functions are not allowed in GROUP BY; line 1 pos 43
+aggregate functions are not allowed in GROUP BY, but found (sum(CAST(data.`b` AS BIGINT)) + CAST(2 AS BIGINT));
 
 
 -- !query 13
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -538,4 +538,11 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {
       Seq(Row(3, 0, 0.0, 1, 5.0), Row(2, 1, 4.0, 0, 0.0))
     )
   }
+
+  test("aggregate function in GROUP BY") {
+    val e = intercept[AnalysisException] {
+      testData.groupBy(sum($"key")).count()
+    }
+    assert(e.message.contains("aggregate functions are not allowed in GROUP BY"))
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -254,6 +254,11 @@ trait CheckAnalysis extends PredicateHelper {`
`254`	`254`	`}`
`255`	`255`
`256`	`256`	`def checkValidGroupingExprs(expr: Expression): Unit = {`
	`257`	`+ if (expr.find(_.isInstanceOf[AggregateExpression]).isDefined) {`
	`258`	`+ failAnalysis(`
	`259`	`+ "aggregate functions are not allowed in GROUP BY, but found " + expr.sql)`
	`260`	`+ }`
	`261`	`+`
`257`	`262`	`// Check if the data type of expr is orderable.`
`258`	`263`	`if (!RowOrdering.isOrderable(expr.dataType)) {`
`259`	`264`	`failAnalysis(`
`@@ -271,8 +276,8 @@ trait CheckAnalysis extends PredicateHelper {`
`271`	`276`	`}`
`272`	`277`	`}`
`273`	`278`
`274`		`- aggregateExprs.foreach(checkValidAggregateExpression)`
`275`	`279`	`groupingExprs.foreach(checkValidGroupingExprs)`
	`280`	`+ aggregateExprs.foreach(checkValidAggregateExpression)`
`276`	`281`
`277`	`282`	`case Sort(orders, _, _) =>`
`278`	`283`	`orders.foreach { order =>`
Original file line number	Diff line number	Diff line change
`@@ -538,4 +538,11 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {`
`538`	`538`	`Seq(Row(3, 0, 0.0, 1, 5.0), Row(2, 1, 4.0, 0, 0.0))`
`539`	`539`	`)`
`540`	`540`	`}`
	`541`	`+`
	`542`	`+ test("aggregate function in GROUP BY") {`
	`543`	`+ val e = intercept[AnalysisException] {`
	`544`	`+ testData.groupBy(sum($"key")).count()`
	`545`	`+ }`
	`546`	`+ assert(e.message.contains("aggregate functions are not allowed in GROUP BY"))`
	`547`	`+ }`
`541`	`548`	`}`