chore: Update documentation and ignore Spark SQL tests for known issue with count distinct on NaN in aggregate (#1847)

andygrove · web-flow · commit cdfdc2146f87 · 2025-06-06T12:59:30.000-06:00
diff --git a/dev/diffs/3.4.3.diff b/dev/diffs/3.4.3.diff
@@ -247,7 +247,7 @@ index cf40e944c09..bdd5be4f462 100644
  
    test("A cached table preserves the partitioning and ordering of its cached SparkPlan") {
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
-index 1cc09c3d7fc..f031fa45c33 100644
+index 1cc09c3d7fc..b85b53a9688 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
 @@ -27,7 +27,7 @@ import org.apache.spark.SparkException
@@ -268,6 +268,16 @@ index 1cc09c3d7fc..f031fa45c33 100644
        }
        assert(exchangePlans.length == 1)
      }
+@@ -1100,7 +1100,8 @@ class DataFrameAggregateSuite extends QueryTest
+     }
+   }
+ 
+-  test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") {
++  test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate",
++    IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) {
+     withTempView("view") {
+       val nan1 = java.lang.Float.intBitsToFloat(0x7f800001)
+       val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff)
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
 index 56e9520fdab..917932336df 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
diff --git a/dev/diffs/3.5.4.diff b/dev/diffs/3.5.4.diff
@@ -226,7 +226,7 @@ index 9815cb816c9..95b5f9992b0 100644
  
    test("A cached table preserves the partitioning and ordering of its cached SparkPlan") {
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
-index 5a8681aed97..da9d25e2eb4 100644
+index 5a8681aed97..db69fde723a 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
 @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Expand
@@ -247,6 +247,16 @@ index 5a8681aed97..da9d25e2eb4 100644
        }
        assert(exchangePlans.length == 1)
      }
+@@ -1255,7 +1255,8 @@ class DataFrameAggregateSuite extends QueryTest
+     }
+   }
+ 
+-  test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") {
++  test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate",
++    IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) {
+     withTempView("view") {
+       val nan1 = java.lang.Float.intBitsToFloat(0x7f800001)
+       val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff)
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
 index 56e9520fdab..917932336df 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
diff --git a/dev/diffs/3.5.5.diff b/dev/diffs/3.5.5.diff
@@ -226,7 +226,7 @@ index 9815cb816c9..95b5f9992b0 100644
  
    test("A cached table preserves the partitioning and ordering of its cached SparkPlan") {
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
-index 5a8681aed97..da9d25e2eb4 100644
+index 5a8681aed97..db69fde723a 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
 @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Expand
@@ -247,6 +247,16 @@ index 5a8681aed97..da9d25e2eb4 100644
        }
        assert(exchangePlans.length == 1)
      }
+@@ -1255,7 +1255,8 @@ class DataFrameAggregateSuite extends QueryTest
+     }
+   }
+ 
+-  test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") {
++  test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate",
++    IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) {
+     withTempView("view") {
+       val nan1 = java.lang.Float.intBitsToFloat(0x7f800001)
+       val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff)
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
 index 56e9520fdab..917932336df 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
diff --git a/dev/diffs/4.0.0-preview1.diff b/dev/diffs/4.0.0-preview1.diff
@@ -268,7 +268,7 @@ index d023fb82185..0f4f03bda6c 100644
  
        withTempView("t0", "t1", "t2") {
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
-index 620ee430cab..9d383a4bff9 100644
+index 620ee430cab..f5df9218fc1 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
 @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.util.AUTO_GENERATED_ALIAS
@@ -289,6 +289,16 @@ index 620ee430cab..9d383a4bff9 100644
        }
        assert(exchangePlans.length == 1)
      }
+@@ -1275,7 +1275,8 @@ class DataFrameAggregateSuite extends QueryTest
+     }
+   }
+ 
+-  test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") {
++  test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate",
++    IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) {
+     withTempView("view") {
+       val nan1 = java.lang.Float.intBitsToFloat(0x7f800001)
+       val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff)
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
 index f6fd6b501d7..11870c85d82 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
diff --git a/docs/source/user-guide/compatibility.md b/docs/source/user-guide/compatibility.md
@@ -29,12 +29,6 @@ Comet aims to provide consistent results with the version of Apache Spark that i
 
 This guide offers information about areas of functionality where there are known differences.
 
-# Compatibility Guide
-
-Comet aims to provide consistent results with the version of Apache Spark that is being used.
-
-This guide offers information about areas of functionality where there are known differences.
-
 ## Parquet Scans
 
 Comet currently has three distinct implementations of the Parquet scan operator. The configuration property
@@ -89,6 +83,9 @@ because they are handled well in Spark (e.g., `SQLOrderingUtil.compareFloats`).
 functions of arrow-rs used by DataFusion do not normalize NaN and zero (e.g., [arrow::compute::kernels::cmp::eq](https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html#)).
 So Comet will add additional normalization expression of NaN and zero for comparison.
 
+There is a known bug when using `count(distinct)` in aggregate queries: each NaN value is counted separately
+rather than as a single distinct value (see [#1824](https://github.com/apache/datafusion-comet/issues/1824)).
+
 ## Incompatible Expressions
 
 Some Comet native expressions are not 100% compatible with Spark and are disabled by default. These expressions
diff --git a/docs/templates/compatibility-template.md b/docs/templates/compatibility-template.md
@@ -29,12 +29,6 @@ Comet aims to provide consistent results with the version of Apache Spark that i
 
 This guide offers information about areas of functionality where there are known differences.
 
-# Compatibility Guide
-
-Comet aims to provide consistent results with the version of Apache Spark that is being used.
-
-This guide offers information about areas of functionality where there are known differences.
-
 ## Parquet Scans
 
 Comet currently has three distinct implementations of the Parquet scan operator. The configuration property
@@ -89,6 +83,9 @@ because they are handled well in Spark (e.g., `SQLOrderingUtil.compareFloats`).
 functions of arrow-rs used by DataFusion do not normalize NaN and zero (e.g., [arrow::compute::kernels::cmp::eq](https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html#)).
 So Comet will add additional normalization expression of NaN and zero for comparison.
 
+There is a known bug when using `count(distinct)` in aggregate queries: each NaN value is counted separately
+rather than as a single distinct value (see [#1824](https://github.com/apache/datafusion-comet/issues/1824)).
+
 ## Incompatible Expressions
 
 Some Comet native expressions are not 100% compatible with Spark and are disabled by default. These expressions