follow comment

AngersZhuuuu · AngersZhuuuu · commit f21cf43fce75 · 2020-06-30T11:09:23.000+08:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -251,12 +251,32 @@ trait PredicateHelper extends Logging {
     resultStack.top
   }
 
+  /**
+   * Convert an expression to conjunctive normal form for pushing predicates through Join.
+   * When expanding predicates, we group them by qualifier to avoid generating unnecessary
+   * expressions and to control the length of the final result, since there are multiple tables.
+   * @param condition the condition to be converted
+   * @return expression seq in conjunctive normal form of input expression, if length exceeds
+   *         the threshold [[SQLConf.MAX_CNF_NODE_COUNT]] or length != 1, return empty Seq
+   */
   def conjunctiveNormalFormAndGroupExpsByQualifier(condition: Expression): Seq[Expression] = {
     conjunctiveNormalForm(condition,
       (expressions: Seq[Expression]) =>
         expressions.groupBy(_.references.map(_.qualifier)).map(_._2.reduceLeft(And)).toSeq)
   }
 
+  /**
+   * Convert an expression to conjunctive normal form when pushing predicates for partition pruning.
+   * When expanding predicates, we group them by reference to avoid generating unnecessary
+   * expressions and to control the length of the final result, since here there is only one table.
+   * In partition pruning strategies, we split filters by [[splitConjunctivePredicates]] and select
+   * partition filters by checking whether their references are a subset of partCols. Combining
+   * expressions grouped by reference when expanding an [[Or]] predicate does not affect the final
+   * pruning result, since [[splitConjunctivePredicates]] does not split [[Or]] expressions.
+   * @param condition the condition to be converted
+   * @return expression seq in conjunctive normal form of input expression, if length exceeds
+   *         the threshold [[SQLConf.MAX_CNF_NODE_COUNT]] or length != 1, return empty Seq
+   */
   def conjunctiveNormalFormAndGroupExpsByReference(condition: Expression): Seq[Expression] = {
     conjunctiveNormalForm(condition,
       (expressions: Seq[Expression]) =>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
@@ -34,7 +34,7 @@ import org.apache.spark.sql.types.StructType
 
 class PruneFileSourcePartitionsSuite extends PrunePartitionSuiteBase {
 
-  convert = "true"
+  override def format: String = "parquet"
 
   object Optimize extends RuleExecutor[LogicalPlan] {
     val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitionsSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor
 
 class PruneHiveTablePartitionsSuite extends PrunePartitionSuiteBase {
 
-  convert = "false"
+  override def format: String = "hive"
 
   object Optimize extends RuleExecutor[LogicalPlan] {
     val batches =
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala
@@ -24,44 +24,44 @@ import org.apache.spark.sql.test.SQLTestUtils
 
 abstract class PrunePartitionSuiteBase extends QueryTest with SQLTestUtils with TestHiveSingleton {
 
-  var convert: String = _
+  protected def format: String
 
   test("SPARK-28169: Convert scan predicate condition to CNF") {
-    withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> convert,
-      HiveUtils.CONVERT_METASTORE_ORC.key -> convert) {
-      withTable("t", "temp") {
+    withTempView("temp") {
+      withTable("t") {
         sql(
           s"""
-             |CREATE TABLE t(i int)
-             |PARTITIONED BY (p int)
-             |STORED AS PARQUET""".stripMargin)
+             |CREATE TABLE t(i INT, p STRING)
+             |USING $format
+             |PARTITIONED BY (p)""".stripMargin)
+
         spark.range(0, 1000, 1).selectExpr("id as col")
           .createOrReplaceTempView("temp")
 
         for (part <- Seq(1, 2, 3, 4)) {
           sql(
             s"""
                |INSERT OVERWRITE TABLE t PARTITION (p='$part')
-               |select col from temp""".stripMargin)
+               |SELECT col FROM temp""".stripMargin)
         }
 
         assertPrunedPartitions(
           "SELECT * FROM t WHERE p = '1' OR (p = '2' AND i = 1)", 2)
         assertPrunedPartitions(
-          "SELECT * FROM t WHERE (p = '1' and i = 2) or (i = 1 or p = '2')", 4)
+          "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (i = 1 OR p = '2')", 4)
         assertPrunedPartitions(
-          "SELECT * FROM t WHERE (p = '1' and i = 2) or (p = '3' and i = 3 )", 2)
+          "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (p = '3' AND i = 3 )", 2)
         assertPrunedPartitions(
-          "SELECT * FROM t WHERE (p = '1' and i = 2) or (p = '2' or p = '3')", 3)
+          "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (p = '2' OR p = '3')", 3)
         assertPrunedPartitions(
           "SELECT * FROM t", 4)
         assertPrunedPartitions(
-          "SELECT * FROM t where p = '1' and i = 2", 1)
+          "SELECT * FROM t WHERE p = '1' AND i = 2", 1)
         assertPrunedPartitions(
           """
             |SELECT i, COUNT(1) FROM (
-            |SELECT * FROM t where  p = '1' OR (p = '2' AND i = 1)
-            |) TMP GROUP BY i
+            |SELECT * FROM t WHERE  p = '1' OR (p = '2' AND i = 1)
+            |) tmp GROUP BY i
           """.stripMargin, 2)
       }
     }