diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
index f04502d113acb..5580dec3ff269 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
@@ -314,6 +314,17 @@ class ParquetFileFormat
       SQLConf.CASE_SENSITIVE.key,
       sparkSession.sessionState.conf.caseSensitiveAnalysis)
 
+    // There are two things to note here.
+    //
+    // 1. Dictionary filtering has a bug in how it evaluates predicates against null
+    // values, so this filtering is disabled. See SPARK-26677.
+    //
+    // 2. The intent is to disable 'parquet.filter.dictionary.enabled', but that key and
+    // 'parquet.filter.stats.enabled' were mistakenly swapped on the Parquet side, so the
+    // stats key is set here instead. Switch to 'parquet.filter.dictionary.enabled' once
+    // Spark upgrades Parquet. See PARQUET-1309.
+    hadoopConf.setIfUnset(ParquetInputFormat.STATS_FILTERING_ENABLED, "false")
+
     ParquetWriteSupport.setSchema(requiredSchema, hadoopConf)
 
     // Sets flags for `ParquetToSparkSchemaConverter`
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
index ce1dc6e159c61..beb89d91c9266 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
@@ -890,6 +890,21 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
       }
     }
   }
+
+  test("SPARK-26677: negated null-safe equality comparison should not filter matched row groups") {
+    (true :: false :: Nil).foreach { vectorized =>
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
+        withTempPath { path =>
+          // Repeated values for dictionary encoding.
+          Seq(Some("A"), Some("A"), None).toDF.repartition(1)
+            .write.parquet(path.getAbsolutePath)
+          val df = spark.read.parquet(path.getAbsolutePath)
+          checkAnswer(stripSparkFilter(df.where("NOT (value <=> 'A')")), df)
+        }
+      }
+    }
+  }
+
 }
 
 object TestingUDT {
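
For context, the regression test's predicate exercises exactly the case the disabled
dictionary filter got wrong. The following minimal standalone sketch (not part of the
patch; the object name, app name, and local master are illustrative assumptions) shows
why NOT (value <=> 'A') must return the null row, while the ordinary equality form
NOT (value = 'A') must not:

import org.apache.spark.sql.SparkSession

object NullSafeEqualitySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("null-safe-equality-sketch")
      .getOrCreate()
    import spark.implicits._

    // Same data shape as the test: repeated values plus a null.
    val df = Seq(Some("A"), Some("A"), None).toDF("value")

    // '<=>' is Spark SQL's null-safe equality: NULL <=> 'A' is false, never NULL,
    // so NOT (value <=> 'A') is true for the null row and it must be returned.
    df.where("NOT (value <=> 'A')").show()  // prints only the null row

    // Ordinary equality propagates null: NOT (NULL = 'A') evaluates to NULL,
    // which filters the row out, so this predicate returns no rows at all.
    df.where("NOT (value = 'A')").show()    // prints no rows

    spark.stop()
  }
}

Because the null row matches the null-safe predicate, any row-group pruning that drops
the dictionary-encoded group containing it silently loses a matching row; stripping
Spark's own post-scan Filter with stripSparkFilter is how the test exposes that
Parquet-level pruning in isolation.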