Skip to content

Commit d9b2ce0

Browse files
pengbo authored and dongjoon-hyun committed
[SPARK-27539][SQL] Fix inaccurate aggregate outputRows estimation with column containing null values
## What changes were proposed in this pull request?

This PR is a follow-up to apache#24286. As gatorsmile pointed out, the aggregate output row estimation is also inaccurate when a group-by column contains null values.

```
> select key from test;
2
NULL
1

spark-sql> desc extended test key;
col_name key
data_type int
comment NULL
min 1
max 2
num_nulls 1
distinct_count 2
```

Because NULL forms its own group but is not included in `distinct_count`, the number of distinct values used for the estimation should be `distinct_count + 1` when the column contains null values.

## How was this patch tested?

Existing tests and a new unit test.

Closes apache#24436 from pengbo/aggregation_estimation.

Authored-by: pengbo <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 93a264d commit d9b2ce0

File tree

2 files changed

+14
-3
lines changed

2 files changed

+14
-3
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ object AggregateEstimation {
4242
(res, expr) => {
4343
val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute])
4444
val distinctCount = columnStat.distinctCount.get
45-
val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) {
46-
1
45+
val distinctValue: BigInt = if (columnStat.nullCount.get > 0) {
46+
distinctCount + 1
4747
} else {
4848
distinctCount
4949
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
4040
attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
4141
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
4242
attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
43-
nullCount = Some(4), avgLen = Some(4), maxLen = Some(4))
43+
nullCount = Some(4), avgLen = Some(4), maxLen = Some(4)),
44+
attr("key33") -> ColumnStat(distinctCount = Some(2), min = None, max = None,
45+
nullCount = Some(2), avgLen = Some(4), maxLen = Some(4))
4446
))
4547

4648
private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1)
@@ -126,6 +128,15 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
126128
expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get)
127129
}
128130

131+
test("group-by column with null value") {
132+
checkAggStats(
133+
tableColumns = Seq("key21", "key33"),
134+
tableRowCount = 6,
135+
groupByColumns = Seq("key21", "key33"),
136+
expectedOutputRowCount = nameToColInfo("key21")._2.distinctCount.get *
137+
(nameToColInfo("key33")._2.distinctCount.get + 1))
138+
}
139+
129140
test("non-cbo estimation") {
130141
val attributes = Seq("key12").map(nameToAttr)
131142
val child = StatsTestPlan(

0 commit comments

Comments
 (0)