[SPARK-33385][SQL] Support bucket pruning for IsNaN

wangyum · cloud-fan · commit 7a5647a93aae · 2020-11-09T09:20:31.000Z
### What changes were proposed in this pull request? This pr add support bucket pruning on `IsNaN` predicate. ### Why are the changes needed? Improve query performance. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #30291 from wangyum/SPARK-33385. Authored-by: Yuming Wang <yumwang@ebay.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.ScanOperation
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan}
+import org.apache.spark.sql.types.{DoubleType, FloatType}
 import org.apache.spark.util.collection.BitSet
 
 /**
@@ -93,6 +94,12 @@ object FileSourceStrategy extends Strategy with PredicateHelper with Logging {
         getBucketSetFromIterable(a, hset)
       case expressions.IsNull(a: Attribute) if a.name == bucketColumnName =>
         getBucketSetFromValue(a, null)
+      case expressions.IsNaN(a: Attribute)
+        if a.name == bucketColumnName && a.dataType == FloatType =>
+        getBucketSetFromValue(a, Float.NaN)
+      case expressions.IsNaN(a: Attribute)
+        if a.name == bucketColumnName && a.dataType == DoubleType =>
+        getBucketSetFromValue(a, Double.NaN)
       case expressions.And(left, right) =>
         getExpressionBuckets(left, bucketColumnName, numBuckets) &
           getExpressionBuckets(right, bucketColumnName, numBuckets)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala
@@ -113,7 +113,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils {
   //   2) Verify the final result is the same as the expected one
   private def checkPrunedAnswers(
       bucketSpec: BucketSpec,
-      bucketValues: Seq[Integer],
+      bucketValues: Seq[Any],
       filterCondition: Column,
       originalDataFrame: DataFrame): Unit = {
     // This test verifies parts of the plan. Disable whole stage codegen.
@@ -245,6 +245,25 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils {
     }
   }
 
+  test("bucket pruning support IsNaN") {
+    withTable("bucketed_table") {
+      val numBuckets = NumBucketsForPruningNullDf
+      val bucketSpec = BucketSpec(numBuckets, Seq("j"), Nil)
+      val naNDF = nullDF.selectExpr("i", "cast(if(isnull(j), 'NaN', j) as double) as j", "k")
+      // json does not support predicate push-down, and thus json is used here
+      naNDF.write
+        .format("json")
+        .bucketBy(numBuckets, "j")
+        .saveAsTable("bucketed_table")
+
+      checkPrunedAnswers(
+        bucketSpec,
+        bucketValues = Double.NaN :: Nil,
+        filterCondition = $"j".isNaN,
+        naNDF)
+    }
+  }
+
   test("read partitioning bucketed tables having composite filters") {
     withTable("bucketed_table") {
       val numBuckets = NumBucketsForPruningDF