From 9798dd4b1246fb50a3b4ec1bc0a9e57860020267 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 23 Jan 2019 13:25:04 +0800 Subject: [PATCH 1/2] Disable dictionary filtering by default in Parquet --- .../datasources/parquet/ParquetFileFormat.scala | 11 +++++++++++ .../datasources/parquet/ParquetQuerySuite.scala | 15 +++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index f04502d113acb..b58545b4321f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -314,6 +314,17 @@ class ParquetFileFormat SQLConf.CASE_SENSITIVE.key, sparkSession.sessionState.conf.caseSensitiveAnalysis) + // There are three things to note here. + // + // 1. Dictionary filtering has an issue about the predication on null. For this reason, + // This filtering is disabled. See SPARK-26677. + // + // 2. We should disable 'parquet.filter.dictionary.enabled' but + // the 'parquet.filter.stats.enabled' and 'parquet.filter.dictionary.enabled' were + // mistakenly swapped on the Parquet side. Spark should switch to + // 'parquet.filter.dictionary.enabled' once it upgrades Parquet. See PARQUET-1309.
+ hadoopConf.setIfUnset(ParquetInputFormat.STATS_FILTERING_ENABLED, "false") + ParquetWriteSupport.setSchema(requiredSchema, hadoopConf) // Sets flags for `ParquetToSparkSchemaConverter` diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index ce1dc6e159c61..beb89d91c9266 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -890,6 +890,21 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } } + + test("SPARK-26677: negated null-safe equality comparison should not filter matched row groups") { + (true :: false :: Nil).foreach { vectorized => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { + withTempPath { path => + // Repeated values for dictionary encoding. 
+ Seq(Some("A"), Some("A"), None).toDF.repartition(1) .write.parquet(path.getAbsolutePath) val df = spark.read.parquet(path.getAbsolutePath) checkAnswer(stripSparkFilter(df.where("NOT (value <=> 'A')")), df) } } } } } object TestingUDT { From eb2eb2bd93394517716df9f47ab650078544e184 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 23 Jan 2019 14:03:35 +0800 Subject: [PATCH 2/2] Fix typo in comments --- .../sql/execution/datasources/parquet/ParquetFileFormat.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index b58545b4321f3..5580dec3ff269 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -314,7 +314,7 @@ class ParquetFileFormat SQLConf.CASE_SENSITIVE.key, sparkSession.sessionState.conf.caseSensitiveAnalysis) - // There are three things to note here. + // There are two things to note here. // // 1. Dictionary filtering has an issue about the predication on null. For this reason, // This filtering is disabled. See SPARK-26677.