
Commit d57f44c

ParquetFilters SQLConf.get
1 parent b9b3160 commit d57f44c

4 files changed: +18 −13 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 3 additions & 3 deletions
@@ -381,12 +381,12 @@ object SQLConf {
   val PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD =
     buildConf("spark.sql.parquet.pushdown.inFilterThreshold")
       .doc("The maximum number of values to filter push-down optimization for IN predicate. " +
-        "Large threshold will not provide much better performance. " +
+        "Large threshold won't necessarily provide much better performance. " +
+        "The experiment argued that 300 is the limit threshold. " +
         "This configuration only has an effect when 'spark.sql.parquet.filterPushdown' is enabled.")
       .internal()
       .intConf
-      .checkValue(threshold => threshold > 0 && threshold <= 300,
-        "The threshold must be greater than 0 and less than 300.")
+      .checkValue(threshold => threshold > 0, "The threshold must be greater than 0.")
       .createWithDefault(10)

   val PARQUET_WRITE_LEGACY_FORMAT = buildConf("spark.sql.parquet.writeLegacyFormat")
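
In practice, the relaxed checkValue means the IN pushdown threshold can now be raised past the old hard cap of 300; only non-positive values are rejected. A minimal sketch of setting it (the session builder options below are illustrative, not part of this commit):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("in-filter-threshold-demo")
  // Previously rejected by checkValue (> 300); accepted after this change.
  .config("spark.sql.parquet.pushdown.inFilterThreshold", 500L)
  .getOrCreate()

// A non-positive value still fails validation when set:
// spark.conf.set("spark.sql.parquet.pushdown.inFilterThreshold", 0L)  // IllegalArgumentException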

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala

Lines changed: 1 addition & 3 deletions
@@ -341,8 +341,6 @@ class ParquetFileFormat
     val enableParquetFilterPushDown: Boolean = sqlConf.parquetFilterPushDown
     // Whole stage codegen (PhysicalRDD) is able to deal with batches directly
     val returningBatch = supportBatch(sparkSession, resultSchema)
-    val pushDownDate = sqlConf.parquetFilterPushDownDate
-    val inThreshold = sqlConf.parquetFilterPushDownInFilterThreshold

     (file: PartitionedFile) => {
       assert(file.partitionValues.numFields == partitionSchema.size)
@@ -353,7 +351,7 @@ class ParquetFileFormat
           // Collects all converted Parquet filter predicates. Notice that not all predicates can be
           // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
           // is used here.
-          .flatMap(new ParquetFilters(pushDownDate, inThreshold).createFilter(requiredSchema, _))
+          .flatMap(new ParquetFilters().createFilter(requiredSchema, _))
           .reduceOption(FilterApi.and)
       } else {
         None
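
With the two vals gone, the per-file reader closure no longer captures the pushdown settings on the driver side; the freshly constructed ParquetFilters reads them itself via SQLConf.get. For callers nothing changes: pushdown is still driven by the same configuration keys. An illustrative read path (the path and column name are placeholders, not from this commit):

// Hypothetical usage; "/tmp/events" and column "id" are placeholders.
spark.conf.set("spark.sql.parquet.filterPushdown", true)
spark.conf.set("spark.sql.parquet.pushdown.inFilterThreshold", 20L)

val df = spark.read.parquet("/tmp/events")
// An IN list with at most 20 distinct values is rewritten into a Parquet
// filter (an OR of equality predicates) and pushed down to the reader.
df.where("id in (1, 2, 3)").count()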

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala

Lines changed: 8 additions & 2 deletions
@@ -25,13 +25,19 @@ import org.apache.parquet.io.api.Binary

 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLDate
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources
 import org.apache.spark.sql.types._

 /**
  * Some utility function to convert Spark data source filters to Parquet filters.
  */
-private[parquet] class ParquetFilters(pushDownDate: Boolean, inFilterThreshold: Int) {
+private[parquet] class ParquetFilters {
+
+  val sqlConf: SQLConf = SQLConf.get
+
+  val pushDownDate = sqlConf.parquetFilterPushDownDate
+  val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold

   private def dateToDays(date: Date): SQLDate = {
     DateTimeUtils.fromJavaDate(date)
@@ -271,7 +277,7 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean, inFilterThreshold:
       createFilter(schema, pred).map(FilterApi.not)

     case sources.In(name, values)
-      if canMakeFilterOn(name) && values.distinct.length <= inFilterThreshold =>
+      if canMakeFilterOn(name) && values.distinct.length <= pushDownInFilterThreshold =>
       values.distinct.flatMap { v =>
         makeEq.lift(nameToType(name)).map(_(name, v))
       }.reduceLeftOption(FilterApi.or)
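
SQLConf.get resolves to the SQLConf of the active session on the calling thread (falling back to a default conf when none is active), so the pushdown flags are snapshotted when a ParquetFilters instance is constructed rather than supplied by the caller. A hedged sketch of the effect, assuming it runs from code inside the parquet package (the class is private[parquet]):

import org.apache.spark.sql.internal.SQLConf

// Whatever the active session's conf says at construction time is what the instance sees.
SQLConf.get.setConfString(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key, "50")
val filters = new ParquetFilters()
assert(filters.pushDownInFilterThreshold == 50)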

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala

Lines changed: 6 additions & 5 deletions
@@ -55,8 +55,7 @@ import org.apache.spark.util.{AccumulatorContext, AccumulatorV2}
  */
 class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext {

-  private lazy val parquetFilters =
-    new ParquetFilters(conf.parquetFilterPushDownDate, conf.parquetFilterPushDownInFilterThreshold)
+  private lazy val parquetFilters = new ParquetFilters()

   override def beforeEach(): Unit = {
     super.beforeEach()
@@ -702,8 +701,8 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex

     import testImplicits._
     withTempPath { path =>
-      (0 to 1024).toDF("a").coalesce(1)
-        .write.option("parquet.block.size", 512)
+      (0 to 1024).toDF("a").selectExpr("if (a = 1024, null, a) AS a") // convert 1024 to null
+        .coalesce(1).write.option("parquet.block.size", 512)
         .parquet(path.getAbsolutePath)
       val df = spark.read.parquet(path.getAbsolutePath)
       Seq(true, false).foreach { pushEnabled =>
@@ -712,7 +711,9 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex
         Seq(1, 5, 10, 11, 1000).foreach { count =>
           assert(df.where(s"a in(${Range(0, count).mkString(",")})").count() === count)
         }
-        assert(df.where(s"a in(null)").count() === 0)
+        assert(df.where("a in(null)").count() === 0)
+        assert(df.where("a = null").count() === 0)
+        assert(df.where("a is null").count() === 1)
       }
     }
   }
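
The extra assertions pin down SQL null semantics for the test data (one row was converted to null): comparisons against NULL such as `a = null` and `a in (null)` evaluate to NULL and match no rows, while `a is null` matches exactly that one row. The same semantics in isolation (a minimal sketch, assuming a SparkSession named spark is in scope):

import spark.implicits._

val df = Seq(Some(1), Some(2), None).toDF("a")
df.where("a = null").count()     // 0: '=' against NULL is never true
df.where("a in (null)").count()  // 0: IN over a NULL literal is never true
df.where("a is null").count()    // 1: only IS NULL matches the null row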
