[SPARK-27539][SQL] Fix inaccurate aggregate outputRows estimation with column containing null values

pengbo · dongjoon-hyun · commit d9b2ce0f0f71 · 2019-04-22T20:30:08.000-07:00
## What changes were proposed in this pull request? This PR is a follow-up to apache#24286. As gatorsmile pointed out, the aggregate output row estimate is also inaccurate when a group-by column contains null values. ``` > select key from test; 2 NULL 1 spark-sql> desc extended test key; col_name key data_type int comment NULL min 1 max 2 num_nulls 1 distinct_count 2 ``` Because distinct_count does not include NULL, while GROUP BY places all NULLs into a group of their own, the number of groups contributed by a column should be estimated as distinct_count + 1 when the column contains null values. ## How was this patch tested? Existing tests, plus a new unit test. Closes apache#24436 from pengbo/aggregation_estimation. Authored-by: pengbo <bo.peng1019@gmail.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
@@ -42,8 +42,8 @@ object AggregateEstimation {
         (res, expr) => {
           val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute])
           val distinctCount = columnStat.distinctCount.get
-          val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) {
-            1
+          val distinctValue: BigInt = if (columnStat.nullCount.get > 0) {
+            distinctCount + 1
           } else {
             distinctCount
           }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
@@ -40,7 +40,9 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
     attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
       nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
     attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
-      nullCount = Some(4), avgLen = Some(4), maxLen = Some(4))
+      nullCount = Some(4), avgLen = Some(4), maxLen = Some(4)),
+    attr("key33") -> ColumnStat(distinctCount = Some(2), min = None, max = None,
+      nullCount = Some(2), avgLen = Some(4), maxLen = Some(4))
   ))
 
   private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1)
@@ -126,6 +128,15 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
       expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get)
   }
 
+  test("group-by column with null value") {
+    checkAggStats(
+      tableColumns = Seq("key21", "key33"),
+      tableRowCount = 6,
+      groupByColumns = Seq("key21", "key33"),
+      expectedOutputRowCount = nameToColInfo("key21")._2.distinctCount.get *
+        (nameToColInfo("key33")._2.distinctCount.get + 1))
+  }
+
   test("non-cbo estimation") {
     val attributes = Seq("key12").map(nameToAttr)
     val child = StatsTestPlan(