Skip to content

Commit cdfdc21

Browse files
authored
chore: Update documentation and ignore Spark SQL tests for known issue with count distinct on NaN in aggregate (#1847)
1 parent 87ef44c commit cdfdc21

File tree

6 files changed

+50
-16
lines changed

6 files changed

+50
-16
lines changed

dev/diffs/3.4.3.diff

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ index cf40e944c09..bdd5be4f462 100644
247247

248248
test("A cached table preserves the partitioning and ordering of its cached SparkPlan") {
249249
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
250-
index 1cc09c3d7fc..f031fa45c33 100644
250+
index 1cc09c3d7fc..b85b53a9688 100644
251251
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
252252
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
253253
@@ -27,7 +27,7 @@ import org.apache.spark.SparkException
@@ -268,6 +268,16 @@ index 1cc09c3d7fc..f031fa45c33 100644
268268
}
269269
assert(exchangePlans.length == 1)
270270
}
271+
@@ -1100,7 +1100,8 @@ class DataFrameAggregateSuite extends QueryTest
272+
}
273+
}
274+
275+
- test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") {
276+
+ test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate",
277+
+ IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) {
278+
withTempView("view") {
279+
val nan1 = java.lang.Float.intBitsToFloat(0x7f800001)
280+
val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff)
271281
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
272282
index 56e9520fdab..917932336df 100644
273283
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala

dev/diffs/3.5.4.diff

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ index 9815cb816c9..95b5f9992b0 100644
226226

227227
test("A cached table preserves the partitioning and ordering of its cached SparkPlan") {
228228
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
229-
index 5a8681aed97..da9d25e2eb4 100644
229+
index 5a8681aed97..db69fde723a 100644
230230
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
231231
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
232232
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Expand
@@ -247,6 +247,16 @@ index 5a8681aed97..da9d25e2eb4 100644
247247
}
248248
assert(exchangePlans.length == 1)
249249
}
250+
@@ -1255,7 +1255,8 @@ class DataFrameAggregateSuite extends QueryTest
251+
}
252+
}
253+
254+
- test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") {
255+
+ test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate",
256+
+ IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) {
257+
withTempView("view") {
258+
val nan1 = java.lang.Float.intBitsToFloat(0x7f800001)
259+
val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff)
250260
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
251261
index 56e9520fdab..917932336df 100644
252262
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala

dev/diffs/3.5.5.diff

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ index 9815cb816c9..95b5f9992b0 100644
226226

227227
test("A cached table preserves the partitioning and ordering of its cached SparkPlan") {
228228
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
229-
index 5a8681aed97..da9d25e2eb4 100644
229+
index 5a8681aed97..db69fde723a 100644
230230
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
231231
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
232232
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Expand
@@ -247,6 +247,16 @@ index 5a8681aed97..da9d25e2eb4 100644
247247
}
248248
assert(exchangePlans.length == 1)
249249
}
250+
@@ -1255,7 +1255,8 @@ class DataFrameAggregateSuite extends QueryTest
251+
}
252+
}
253+
254+
- test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") {
255+
+ test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate",
256+
+ IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) {
257+
withTempView("view") {
258+
val nan1 = java.lang.Float.intBitsToFloat(0x7f800001)
259+
val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff)
250260
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
251261
index 56e9520fdab..917932336df 100644
252262
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala

dev/diffs/4.0.0-preview1.diff

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ index d023fb82185..0f4f03bda6c 100644
268268

269269
withTempView("t0", "t1", "t2") {
270270
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
271-
index 620ee430cab..9d383a4bff9 100644
271+
index 620ee430cab..f5df9218fc1 100644
272272
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
273273
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
274274
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.util.AUTO_GENERATED_ALIAS
@@ -289,6 +289,16 @@ index 620ee430cab..9d383a4bff9 100644
289289
}
290290
assert(exchangePlans.length == 1)
291291
}
292+
@@ -1275,7 +1275,8 @@ class DataFrameAggregateSuite extends QueryTest
293+
}
294+
}
295+
296+
- test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") {
297+
+ test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate",
298+
+ IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) {
299+
withTempView("view") {
300+
val nan1 = java.lang.Float.intBitsToFloat(0x7f800001)
301+
val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff)
292302
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
293303
index f6fd6b501d7..11870c85d82 100644
294304
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala

docs/source/user-guide/compatibility.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,6 @@ Comet aims to provide consistent results with the version of Apache Spark that i
2929

3030
This guide offers information about areas of functionality where there are known differences.
3131

32-
# Compatibility Guide
33-
34-
Comet aims to provide consistent results with the version of Apache Spark that is being used.
35-
36-
This guide offers information about areas of functionality where there are known differences.
37-
3832
## Parquet Scans
3933

4034
Comet currently has three distinct implementations of the Parquet scan operator. The configuration property
@@ -89,6 +83,9 @@ because they are handled well in Spark (e.g., `SQLOrderingUtil.compareFloats`).
8983
functions of arrow-rs used by DataFusion do not normalize NaN and zero (e.g., [arrow::compute::kernels::cmp::eq](https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html#)).
9084
So, for comparisons, Comet adds an additional expression that normalizes NaN and zero.
9185

86+
There is a known bug when using `count(distinct)` in aggregate queries, where each NaN value is counted
87+
separately [#1824](https://github.com/apache/datafusion-comet/issues/1824).
88+
9289
## Incompatible Expressions
9390

9491
Some Comet native expressions are not 100% compatible with Spark and are disabled by default. These expressions

docs/templates/compatibility-template.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,6 @@ Comet aims to provide consistent results with the version of Apache Spark that i
2929

3030
This guide offers information about areas of functionality where there are known differences.
3131

32-
# Compatibility Guide
33-
34-
Comet aims to provide consistent results with the version of Apache Spark that is being used.
35-
36-
This guide offers information about areas of functionality where there are known differences.
37-
3832
## Parquet Scans
3933

4034
Comet currently has three distinct implementations of the Parquet scan operator. The configuration property
@@ -89,6 +83,9 @@ because they are handled well in Spark (e.g., `SQLOrderingUtil.compareFloats`).
8983
functions of arrow-rs used by DataFusion do not normalize NaN and zero (e.g., [arrow::compute::kernels::cmp::eq](https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html#)).
9084
So, for comparisons, Comet adds an additional expression that normalizes NaN and zero.
9185

86+
There is a known bug when using `count(distinct)` in aggregate queries, where each NaN value is counted
87+
separately [#1824](https://github.com/apache/datafusion-comet/issues/1824).
88+
9289
## Incompatible Expressions
9390

9491
Some Comet native expressions are not 100% compatible with Spark and are disabled by default. These expressions

0 commit comments

Comments
 (0)