From d6040ea0028754c7fe39ddcebb6bd027749acc4e Mon Sep 17 00:00:00 2001 From: Miles Yucht Date: Thu, 24 May 2018 17:16:37 +0200 Subject: [PATCH 1/5] Add tests, and small clean-up of the NOT IN pathway --- .../sql/catalyst/optimizer/subquery.scala | 9 +- .../org/apache/spark/sql/SubquerySuite.scala | 133 +++++++++++++++++- 2 files changed, 137 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index 709db6d8bec7d..de89e17e51f1b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -116,15 +116,16 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { // (a1,a2,...) = (b1,b2,...) // to // (a1=b1 OR isnull(a1=b1)) AND (a2=b2 OR isnull(a2=b2)) AND ... - val joinConds = splitConjunctivePredicates(joinCond.get) + val baseJoinConds = splitConjunctivePredicates(joinCond.get) + val nullAwareJoinConds = baseJoinConds.map(c => Or(c, IsNull(c))) // After that, add back the correlated join predicate(s) in the subquery // Example: // SELECT ... FROM A WHERE A.A1 NOT IN (SELECT B.B1 FROM B WHERE B.B2 = A.A2 AND B.B3 > 1) // will have the final conditions in the LEFT ANTI as - // (A.A1 = B.B1 OR ISNULL(A.A1 = B.B1)) AND (B.B2 = A.A2) - val pairs = (joinConds.map(c => Or(c, IsNull(c))) ++ conditions).reduceLeft(And) + // (A.A1 = B.B1 OR ISNULL(A.A1 = B.B1)) AND (B.B2 = A.A2) AND B.B3 > 1 + val finalJoinCond = (nullAwareJoinConds ++ conditions).reduceLeft(And) // Deduplicate conflicting attributes if any. - dedupJoin(Join(outerPlan, sub, LeftAnti, Option(pairs))) + dedupJoin(Join(outerPlan, sub, LeftAnti, Option(finalJoinCond))) case (p, predicate) => val (newCond, inputPlan) = rewriteExistentialExpr(Seq(predicate), p) Project(p.output, Filter(newCond.get, inputPlan)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index acef62d81ee12..ecc148ecc6568 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -17,10 +17,12 @@ package org.apache.spark.sql +import org.scalatest.GivenWhenThen + import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.test.SharedSQLContext -class SubquerySuite extends QueryTest with SharedSQLContext { +class SubquerySuite extends QueryTest with SharedSQLContext with GivenWhenThen { import testImplicits._ setupTestData() @@ -275,6 +277,135 @@ class SubquerySuite extends QueryTest with SharedSQLContext { } + // ``col NOT IN expr'' is quite difficult to reason about. There are many edge cases, some of the + // rules are not very intuitive, and precedence and treatment of null values is somewhat + // unintuitive. To make this simpler to understand, I've come up with a plain English way of + // describing the expected behavior of this query. + // + // - If the subquery is empty (i.e. returns no rows), the row should be returned, regardless of + // whether the filtered columns include nulls. + // - If the subquery contains a result with all nulls, then the row should not be returned. + // - If for all non-null filter columns there exists a row in the subquery in which each column + // either + // 1. is equal to the corresponding filter column or + // 2. is null + // then the row should not be returned. (This includes the case where all filter columns are + // null.) + // - Otherwise, the row should be returned. + // + // Using these rules, we can come up with a set of test cases for single-column and multi-column + // NOT IN test cases. + test("NOT IN single column with nulls predicate subquery") { + // Test cases for single-column ``WHERE a NOT IN (SELECT c FROM r ...)'': + // | # | does subquery include null? | is a null? | a = c? | row with a included in result? | + // | 1 | empty | | | yes | + // | 2 | yes | | | no | + // | 3 | no | yes | | no | + // | 4 | no | no | yes | no | + // | 5 | no | no | no | yes | + Seq(row((null, 5.0)), row((3, 3.0))).toDF("a", "b").createOrReplaceTempView("m") + Seq(row((2, 3.0)), row((2, 3.0)), row((null, 5.0))).toDF("c", "d").createOrReplaceTempView("s") + + // Single-column test cases + val subqueryIsEmpty = "d > 6.0" + val cIncludesNull = "d = 5.0" + val cDoesNotMatchA = "d = 3.0" + val cMatchesA = "d = 5.0" + val aIsNull = "b = 5.0" + val aIsNotNull = "b = 3.0" + + val includesNullRow = Row(null, 5.0) :: Nil + val includesNotNullRow = Row(3, 3.0) :: Nil + val doesNotIncludeRow = Nil + + val singleColumnTestCases = Seq( + ("Case 1a (subquery is empty)", subqueryIsEmpty, aIsNull, includesNullRow), + ("Case 1b (subquery is empty)", subqueryIsEmpty, aIsNotNull, includesNotNullRow), + ("Case 2a (subquery includes null)", cIncludesNull, aIsNull, doesNotIncludeRow), + ("Case 2b (subquery includes null)", cIncludesNull, aIsNotNull, doesNotIncludeRow), + ("Case 3 (probe column is null)", cDoesNotMatchA, aIsNull, doesNotIncludeRow), + ("Case 4 (there is a match)", cMatchesA, aIsNotNull, doesNotIncludeRow), + ("Case 5 (there is no match)", cDoesNotMatchA, aIsNotNull, includesNotNullRow)) + + for ((given, sClause, mClause, expectedOutput) <- singleColumnTestCases) { + Given(given) + val query = s"SELECT * FROM m WHERE $mClause AND a NOT IN (SELECT c FROM s WHERE $sClause)" + checkAnswer(sql(query), expectedOutput) + } + + // Correlated subqueries should also be handled properly. The addition of the correlated + // subquery changes the query from case 2/3/4 to case 1. Because of this, the row from l should + // be included in the output. + val correlatedSubqueryTestCases = Seq( + ("Case 2a->1 (subquery had nulls)", cIncludesNull, aIsNull, includesNullRow), + ("Case 2b->1 (subquery had nulls)", cIncludesNull, aIsNotNull, includesNotNullRow), + ("Case 3->1 (probe column was null)", cMatchesA, aIsNull, includesNullRow), + ("Case 4->1 (there was a match)", cMatchesA, aIsNotNull, includesNotNullRow)) + for ((given, sClause, mClause, expectedOutput) <- correlatedSubqueryTestCases) { + Given(given) + // scalastyle:off + val query = + s"SELECT * FROM m WHERE $mClause AND a NOT IN (SELECT c FROM s WHERE $sClause AND c < b - 10)" + // scalastyle:on + checkAnswer(sql(query), expectedOutput) + } + } + + test("NOT IN multi column with nulls predicate subquery") { + // scalastyle:off + // Test cases for multi-column ``WHERE a NOT IN (SELECT c FROM r ...)'': + // | # | does subquery include null? | do filter columns contain null? | a = c? | b = d? | row included in result? | + // | 1 | empty | * | * | * | yes | + // | 2 | 1+ row has null for all columns | * | * | * | no | + // | 3 | no row has null for all columns | (yes, yes) | * | * | no | + // | 4 | no | (no, yes) | yes | * | no | + // | 5 | no row has null for all columns | (no, yes) | no | * | yes | + // | 6 | no | (no, no) | yes | yes | no | + // | 7 | no | (no, no) | _ | _ | yes | + // + // This can clearly be generalized, but only these cases are tested here. + // scalastyle:on + + Seq(row((null, null)), row((3, 5.0)), row((2, null)), row((2, 3.0))).toDF("a", "b") + .createOrReplaceTempView("m") + Seq(row((null, null)), row((2, 3.0)), row((3, null))).toDF("c", "d") + .createOrReplaceTempView("s") + + val subqueryIsEmpty = "c > 200" // Returns () + val dIsNull = "c = 3" // Returns (3, null) + val cAndDAreNull = "c IS NULL AND d IS NULL" // Returns (null, null) + val cAndDAreNotNull = "c = 2" // Returns (2, 3.0) + + val aAndBAreNull = "a IS NULL AND b IS NULL" // Returns (null, null) + val aAndBAreNotNull = "a = 3" // Returns (3, 5.0) + val aAndBMatch = "a = 2 AND b = 3.0" // Returns (2, 3.0) + val aIsNotNull = "a = 2" // Returns (2, null), (2, 3.0) + + val includesNullRow = Row(null, null) :: Nil + val includesSemiNullAndNotNullRow = Row(2, null) :: Row(2, 3.0) :: Nil + val includesPartiallyNullRow = Row(2, null) :: Nil + val includesNotNullRow = Row(3, 5.0) :: Nil + val doesNotIncludeRow = Nil + val multiColumnTestCases = Seq( + ("Case 1a (subquery is empty)", subqueryIsEmpty, aAndBAreNull, includesNullRow), + ("Case 1b (subquery is empty)", subqueryIsEmpty, aIsNotNull, includesSemiNullAndNotNullRow), + ("Case 2a (subquery contains null)", cAndDAreNull, aAndBAreNull, doesNotIncludeRow), + ("Case 2b (subquery contains null)", cAndDAreNull, aAndBAreNotNull, doesNotIncludeRow), + ("Case 3 (probe columns are all null)", dIsNull, aAndBAreNull, doesNotIncludeRow), + ("Case 4 (null column, match)", cAndDAreNotNull, aIsNotNull, doesNotIncludeRow), + ("Case 5 (null column, no match)", dIsNull, aIsNotNull, includesSemiNullAndNotNullRow), + ("Case 6 (no null column, match)", cAndDAreNotNull, aAndBMatch, doesNotIncludeRow), + ("Case 7 (no null column, no match)", cAndDAreNotNull, aAndBAreNotNull, includesNotNullRow)) + + for ((given, sClause, mClause, expectedOutput) <- multiColumnTestCases) { + Given(given) + val query = + s"SELECT * FROM m WHERE $mClause AND (a, b) NOT IN (SELECT c, d FROM s WHERE $sClause)" + checkAnswer(sql(query), expectedOutput) + } + } + + test("IN predicate subquery within OR") { checkAnswer( sql("select * from l where l.a in (select c from r)" + From 45962d65eaea0ff5ab5c5235076b93c44481f0b4 Mon Sep 17 00:00:00 2001 From: Miles Yucht Date: Fri, 25 May 2018 10:04:26 +0200 Subject: [PATCH 2/5] Port to SQLQueryTestSuite --- .../not-in-unit-tests-multi-column.sql | 98 ++++++++++++ .../not-in-unit-tests-single-column.sql | 123 +++++++++++++++ .../not-in-unit-tests-multi-column.sql.out | 134 ++++++++++++++++ .../not-in-unit-tests-single-column.sql.out | 149 ++++++++++++++++++ .../typeCoercion/native/concat.sql.out | 2 +- .../org/apache/spark/sql/SubquerySuite.scala | 133 +--------------- 6 files changed, 506 insertions(+), 133 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-single-column.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column.sql new file mode 100644 index 0000000000000..9f8dc7fca3b94 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column.sql @@ -0,0 +1,98 @@ +-- Unit tests for simple NOT IN predicate subquery across multiple columns. +-- +-- See not-in-single-column-unit-tests.sql for an introduction. +-- +-- Test cases for multi-column ``WHERE a NOT IN (SELECT c FROM r ...)'': +-- | # | does subquery include null? | do filter columns contain null? | a = c? | b = d? | row included in result? | +-- | 1 | empty | * | * | * | yes | +-- | 2 | 1+ row has null for all columns | * | * | * | no | +-- | 3 | no row has null for all columns | (yes, yes) | * | * | no | +-- | 4 | no row has null for all columns | (no, yes) | yes | * | no | +-- | 5 | no row has null for all columns | (no, yes) | no | * | yes | +-- | 6 | no | (no, no) | yes | yes | no | +-- | 7 | no | (no, no) | _ | _ | yes | +-- +-- This can be generalized to include more tests for more columns, but it covers the main cases +-- when there is more than one column. + +CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES + (null, null), + (null, 1.0), + (2, 3.0), + (4, 5.0) + AS m(a, b); + +CREATE TEMPORARY VIEW s AS SELECT * FROM VALUES + (null, null), + (0, 1.0), + (2, 3.0), + (4, null) + AS s(c, d); + + -- Case 1 + -- (subquery is empty -> row is returned) +SELECT * +FROM m +WHERE (a, b) NOT IN (SELECT * + FROM s + WHERE d > 5.0) -- Matches no rows +; + + -- Case 2 + -- (subquery contains a row with null in all columns -> row not returned) +SELECT * +FROM m +WHERE (a, b) NOT IN (SELECT * + FROM s + WHERE c IS NULL AND d IS NULL) -- Matches only (null, null) +; + + -- Case 3 + -- (probe-side columns are all null -> row not returned) +SELECT * +FROM m +WHERE a IS NULL AND b IS NULL -- Matches only (null, null) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c IS NOT NULL) -- Matches (0, 1.0), (2, 3.0), (4, null) +; + + -- Case 4 + -- (one column null, other column matches a row in the subquery result -> row not returned) +SELECT * +FROM m +WHERE b = 1.0 -- Matches (null, 1.0) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c IS NOT NULL) -- Matches (0, 1.0), (2, 3.0), (4, null) +; + + -- Case 5 + -- (one null column with no match -> row is returned) +SELECT * +FROM m +WHERE b = 1.0 -- Matches (null, 1.0) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c = 2) -- Matches (2, 3.0) +; + + -- Case 6 + -- (no null columns with match -> row not returned) +SELECT * +FROM m +WHERE b = 3.0 -- Matches (2, 3.0) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c = 2) -- Matches (2, 3.0) +; + + -- Case 7 + -- (no null columns with no match -> row is returned) +SELECT * +FROM m +WHERE b = 5.0 -- Matches (4, 5.0) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c = 2) -- Matches (2, 3.0) +; diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-single-column.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-single-column.sql new file mode 100644 index 0000000000000..2cc08e10acf67 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-single-column.sql @@ -0,0 +1,123 @@ +-- Unit tests for simple NOT IN predicate subquery across a single column. +-- +-- ``col NOT IN expr'' is quite difficult to reason about. There are many edge cases, some of the +-- rules are confusing to the uninitiated, and precedence and treatment of null values is plain +-- unintuitive. To make this simpler to understand, I've come up with a plain English way of +-- describing the expected behavior of this query. +-- +-- - If the subquery is empty (i.e. returns no rows), the row should be returned, regardless of +-- whether the filtered columns include nulls. +-- - If the subquery contains a result with all columns null, then the row should not be returned. +-- - If for all non-null filter columns there exists a row in the subquery in which each column +-- either +-- 1. is equal to the corresponding filter column or +-- 2. is null +-- then the row should not be returned. (This includes the case where all filter columns are +-- null.) +-- - Otherwise, the row should be returned. +-- +-- Using these rules, we can come up with a set of test cases for single-column and multi-column +-- NOT IN test cases. +-- +-- Test cases for single-column ``WHERE a NOT IN (SELECT c FROM r ...)'': +-- | # | does subquery include null? | is a null? | a = c? | row with a included in result? | +-- | 1 | empty | | | yes | +-- | 2 | yes | | | no | +-- | 3 | no | yes | | no | +-- | 4 | no | no | yes | no | +-- | 5 | no | no | no | yes | +-- +-- There are also some considerations around correlated subqueries. Correlated subqueries can +-- cause cases 2, 3, or 4 to be reduced to case 1 by limiting the number of rows returned by the +-- subquery, so the row from the parent table should always be included in the output. + +CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES + (null, 1.0), + (2, 3.0), + (4, 5.0) + AS m(a, b); + +CREATE TEMPORARY VIEW s AS SELECT * FROM VALUES + (null, 1.0), + (2, 3.0), + (6, 7.0) + AS s(c, d); + + -- Uncorrelated NOT IN Subquery test cases + -- Case 1 + -- (empty subquery -> all rows returned) +SELECT * +FROM m +WHERE a NOT IN (SELECT c + FROM s + WHERE d > 10.0) -- (empty subquery) +; + + -- Case 2 + -- (subquery includes null -> no rows returned) +SELECT * +FROM m +WHERE a NOT IN (SELECT c + FROM s + WHERE d = 1.0) -- Only matches (null, 1.0) +; + + -- Case 3 + -- (probe column is null -> row not returned) +SELECT * +FROM m +WHERE b = 1.0 -- Only matches (null, 1.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = 3.0) -- Matches (2, 3.0) +; + + -- Case 4 + -- (probe column matches subquery row -> row not returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = 3.0) -- Matches (2, 3.0) +; + + -- Case 5 + -- (probe column does not match subquery row -> row is returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = 7.0) -- Matches (6, 7.0) +; + + -- Correlated NOT IN subquery test cases + -- Case 2->1 + -- (subquery had nulls but they are removed by correlated subquery -> all rows returned) +SELECT * +FROM m +WHERE a NOT IN (SELECT c + FROM s + WHERE d = b + 10) -- Matches no row +; + + -- Case 3->1 + -- (probe column is null but subquery returns no rows -> row is returned) +SELECT * +FROM m +WHERE b = 1.0 -- Only matches (null, 1.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = b + 10) -- Matches no row +; + + -- Case 4->1 + -- (probe column matches row which is filtered out by correlated subquery -> row is returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = b + 10) -- Matches no row +; diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out new file mode 100644 index 0000000000000..aa5f64b8ebf55 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out @@ -0,0 +1,134 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 9 + + +-- !query 0 +CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES + (null, null), + (null, 1.0), + (2, 3.0), + (4, 5.0) + AS m(a, b) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +CREATE TEMPORARY VIEW s AS SELECT * FROM VALUES + (null, null), + (0, 1.0), + (2, 3.0), + (4, null) + AS s(c, d) +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +-- Case 1 + -- (subquery is empty -> row is returned) +SELECT * +FROM m +WHERE (a, b) NOT IN (SELECT * + FROM s + WHERE d > 5.0) -- Matches no rows +-- !query 2 schema +struct +-- !query 2 output +2 3 +4 5 +NULL 1 +NULL NULL + + +-- !query 3 +-- Case 2 + -- (subquery contains a row with null in all columns -> row not returned) +SELECT * +FROM m +WHERE (a, b) NOT IN (SELECT * + FROM s + WHERE c IS NULL AND d IS NULL) -- Matches only (null, null) +-- !query 3 schema +struct +-- !query 3 output + + + +-- !query 4 +-- Case 3 + -- (probe-side columns are all null -> row not returned) +SELECT * +FROM m +WHERE a IS NULL AND b IS NULL -- Matches only (null, null) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c IS NOT NULL) -- Matches (0, 1.0), (2, 3.0), (4, null) +-- !query 4 schema +struct +-- !query 4 output + + + +-- !query 5 +-- Case 4 + -- (one column null, other column matches a row in the subquery result -> row not returned) +SELECT * +FROM m +WHERE b = 1.0 -- Matches (null, 1.0) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c IS NOT NULL) -- Matches (0, 1.0), (2, 3.0), (4, null) +-- !query 5 schema +struct +-- !query 5 output + + + +-- !query 6 +-- Case 5 + -- (one null column with no match -> row is returned) +SELECT * +FROM m +WHERE b = 1.0 -- Matches (null, 1.0) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c = 2) -- Matches (2, 3.0) +-- !query 6 schema +struct +-- !query 6 output +NULL 1 + + +-- !query 7 +-- Case 6 + -- (no null columns with match -> row not returned) +SELECT * +FROM m +WHERE b = 3.0 -- Matches (2, 3.0) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c = 2) -- Matches (2, 3.0) +-- !query 7 schema +struct +-- !query 7 output + + + +-- !query 8 +-- Case 7 + -- (no null columns with no match -> row is returned) +SELECT * +FROM m +WHERE b = 5.0 -- Matches (4, 5.0) + AND (a, b) NOT IN (SELECT * + FROM s + WHERE c = 2) -- Matches (2, 3.0) +-- !query 8 schema +struct +-- !query 8 output +4 5 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out new file mode 100644 index 0000000000000..f58ebeacc2872 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out @@ -0,0 +1,149 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 10 + + +-- !query 0 +CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES + (null, 1.0), + (2, 3.0), + (4, 5.0) + AS m(a, b) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +CREATE TEMPORARY VIEW s AS SELECT * FROM VALUES + (null, 1.0), + (2, 3.0), + (6, 7.0) + AS s(c, d) +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +-- Uncorrelated NOT IN Subquery test cases + -- Case 1 + -- (empty subquery -> all rows returned) +SELECT * +FROM m +WHERE a NOT IN (SELECT c + FROM s + WHERE d > 10.0) -- (empty subquery) +-- !query 2 schema +struct +-- !query 2 output +2 3 +4 5 +NULL 1 + + +-- !query 3 +-- Case 2 + -- (subquery includes null -> no rows returned) +SELECT * +FROM m +WHERE a NOT IN (SELECT c + FROM s + WHERE d = 1.0) -- Only matches (null, 1.0) +-- !query 3 schema +struct +-- !query 3 output + + + +-- !query 4 +-- Case 3 + -- (probe column is null -> row not returned) +SELECT * +FROM m +WHERE b = 1.0 -- Only matches (null, 1.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = 3.0) -- Matches (2, 3.0) +-- !query 4 schema +struct +-- !query 4 output + + + +-- !query 5 +-- Case 4 + -- (probe column matches subquery row -> row not returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = 3.0) -- Matches (2, 3.0) +-- !query 5 schema +struct +-- !query 5 output + + + +-- !query 6 +-- Case 5 + -- (probe column does not match subquery row -> row is returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = 7.0) -- Matches (6, 7.0) +-- !query 6 schema +struct +-- !query 6 output +2 3 + + +-- !query 7 +-- Correlated NOT IN subquery test cases + -- Case 2->1 + -- (subquery had nulls but they are removed by correlated subquery -> all rows returned) +SELECT * +FROM m +WHERE a NOT IN (SELECT c + FROM s + WHERE d = b + 10) -- Matches no row +-- !query 7 schema +struct +-- !query 7 output +2 3 +4 5 +NULL 1 + + +-- !query 8 +-- Case 3->1 + -- (probe column is null but subquery returns no rows -> row is returned) +SELECT * +FROM m +WHERE b = 1.0 -- Only matches (null, 1.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = b + 10) -- Matches no row +-- !query 8 schema +struct +-- !query 8 output +NULL 1 + + +-- !query 9 +-- Case 4->1 + -- (probe column matches row which is filtered out by correlated subquery -> row is returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (SELECT c + FROM s + WHERE d = b + 10) -- Matches no row +-- !query 9 schema +struct +-- !query 9 output +2 3 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/concat.sql.out index 62befc5ca0f15..be637b66abc86 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/concat.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 11 +-- Number of queries: 14 -- !query 0 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index ecc148ecc6568..acef62d81ee12 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -17,12 +17,10 @@ package org.apache.spark.sql -import org.scalatest.GivenWhenThen - import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.test.SharedSQLContext -class SubquerySuite extends QueryTest with SharedSQLContext with GivenWhenThen { +class SubquerySuite extends QueryTest with SharedSQLContext { import testImplicits._ setupTestData() @@ -277,135 +275,6 @@ class SubquerySuite extends QueryTest with SharedSQLContext with GivenWhenThen { } - // ``col NOT IN expr'' is quite difficult to reason about. There are many edge cases, some of the - // rules are not very intuitive, and precedence and treatment of null values is somewhat - // unintuitive. To make this simpler to understand, I've come up with a plain English way of - // describing the expected behavior of this query. - // - // - If the subquery is empty (i.e. returns no rows), the row should be returned, regardless of - // whether the filtered columns include nulls. - // - If the subquery contains a result with all nulls, then the row should not be returned. - // - If for all non-null filter columns there exists a row in the subquery in which each column - // either - // 1. is equal to the corresponding filter column or - // 2. is null - // then the row should not be returned. (This includes the case where all filter columns are - // null.) - // - Otherwise, the row should be returned. - // - // Using these rules, we can come up with a set of test cases for single-column and multi-column - // NOT IN test cases. - test("NOT IN single column with nulls predicate subquery") { - // Test cases for single-column ``WHERE a NOT IN (SELECT c FROM r ...)'': - // | # | does subquery include null? | is a null? | a = c? | row with a included in result? | - // | 1 | empty | | | yes | - // | 2 | yes | | | no | - // | 3 | no | yes | | no | - // | 4 | no | no | yes | no | - // | 5 | no | no | no | yes | - Seq(row((null, 5.0)), row((3, 3.0))).toDF("a", "b").createOrReplaceTempView("m") - Seq(row((2, 3.0)), row((2, 3.0)), row((null, 5.0))).toDF("c", "d").createOrReplaceTempView("s") - - // Single-column test cases - val subqueryIsEmpty = "d > 6.0" - val cIncludesNull = "d = 5.0" - val cDoesNotMatchA = "d = 3.0" - val cMatchesA = "d = 5.0" - val aIsNull = "b = 5.0" - val aIsNotNull = "b = 3.0" - - val includesNullRow = Row(null, 5.0) :: Nil - val includesNotNullRow = Row(3, 3.0) :: Nil - val doesNotIncludeRow = Nil - - val singleColumnTestCases = Seq( - ("Case 1a (subquery is empty)", subqueryIsEmpty, aIsNull, includesNullRow), - ("Case 1b (subquery is empty)", subqueryIsEmpty, aIsNotNull, includesNotNullRow), - ("Case 2a (subquery includes null)", cIncludesNull, aIsNull, doesNotIncludeRow), - ("Case 2b (subquery includes null)", cIncludesNull, aIsNotNull, doesNotIncludeRow), - ("Case 3 (probe column is null)", cDoesNotMatchA, aIsNull, doesNotIncludeRow), - ("Case 4 (there is a match)", cMatchesA, aIsNotNull, doesNotIncludeRow), - ("Case 5 (there is no match)", cDoesNotMatchA, aIsNotNull, includesNotNullRow)) - - for ((given, sClause, mClause, expectedOutput) <- singleColumnTestCases) { - Given(given) - val query = s"SELECT * FROM m WHERE $mClause AND a NOT IN (SELECT c FROM s WHERE $sClause)" - checkAnswer(sql(query), expectedOutput) - } - - // Correlated subqueries should also be handled properly. The addition of the correlated - // subquery changes the query from case 2/3/4 to case 1. Because of this, the row from l should - // be included in the output. - val correlatedSubqueryTestCases = Seq( - ("Case 2a->1 (subquery had nulls)", cIncludesNull, aIsNull, includesNullRow), - ("Case 2b->1 (subquery had nulls)", cIncludesNull, aIsNotNull, includesNotNullRow), - ("Case 3->1 (probe column was null)", cMatchesA, aIsNull, includesNullRow), - ("Case 4->1 (there was a match)", cMatchesA, aIsNotNull, includesNotNullRow)) - for ((given, sClause, mClause, expectedOutput) <- correlatedSubqueryTestCases) { - Given(given) - // scalastyle:off - val query = - s"SELECT * FROM m WHERE $mClause AND a NOT IN (SELECT c FROM s WHERE $sClause AND c < b - 10)" - // scalastyle:on - checkAnswer(sql(query), expectedOutput) - } - } - - test("NOT IN multi column with nulls predicate subquery") { - // scalastyle:off - // Test cases for multi-column ``WHERE a NOT IN (SELECT c FROM r ...)'': - // | # | does subquery include null? | do filter columns contain null? | a = c? | b = d? | row included in result? | - // | 1 | empty | * | * | * | yes | - // | 2 | 1+ row has null for all columns | * | * | * | no | - // | 3 | no row has null for all columns | (yes, yes) | * | * | no | - // | 4 | no | (no, yes) | yes | * | no | - // | 5 | no row has null for all columns | (no, yes) | no | * | yes | - // | 6 | no | (no, no) | yes | yes | no | - // | 7 | no | (no, no) | _ | _ | yes | - // - // This can clearly be generalized, but only these cases are tested here. - // scalastyle:on - - Seq(row((null, null)), row((3, 5.0)), row((2, null)), row((2, 3.0))).toDF("a", "b") - .createOrReplaceTempView("m") - Seq(row((null, null)), row((2, 3.0)), row((3, null))).toDF("c", "d") - .createOrReplaceTempView("s") - - val subqueryIsEmpty = "c > 200" // Returns () - val dIsNull = "c = 3" // Returns (3, null) - val cAndDAreNull = "c IS NULL AND d IS NULL" // Returns (null, null) - val cAndDAreNotNull = "c = 2" // Returns (2, 3.0) - - val aAndBAreNull = "a IS NULL AND b IS NULL" // Returns (null, null) - val aAndBAreNotNull = "a = 3" // Returns (3, 5.0) - val aAndBMatch = "a = 2 AND b = 3.0" // Returns (2, 3.0) - val aIsNotNull = "a = 2" // Returns (2, null), (2, 3.0) - - val includesNullRow = Row(null, null) :: Nil - val includesSemiNullAndNotNullRow = Row(2, null) :: Row(2, 3.0) :: Nil - val includesPartiallyNullRow = Row(2, null) :: Nil - val includesNotNullRow = Row(3, 5.0) :: Nil - val doesNotIncludeRow = Nil - val multiColumnTestCases = Seq( - ("Case 1a (subquery is empty)", subqueryIsEmpty, aAndBAreNull, includesNullRow), - ("Case 1b (subquery is empty)", subqueryIsEmpty, aIsNotNull, includesSemiNullAndNotNullRow), - ("Case 2a (subquery contains null)", cAndDAreNull, aAndBAreNull, doesNotIncludeRow), - ("Case 2b (subquery contains null)", cAndDAreNull, aAndBAreNotNull, doesNotIncludeRow), - ("Case 3 (probe columns are all null)", dIsNull, aAndBAreNull, doesNotIncludeRow), - ("Case 4 (null column, match)", cAndDAreNotNull, aIsNotNull, doesNotIncludeRow), - ("Case 5 (null column, no match)", dIsNull, aIsNotNull, includesSemiNullAndNotNullRow), - ("Case 6 (no null column, match)", cAndDAreNotNull, aAndBMatch, doesNotIncludeRow), - ("Case 7 (no null column, no match)", cAndDAreNotNull, aAndBAreNotNull, includesNotNullRow)) - - for ((given, sClause, mClause, expectedOutput) <- multiColumnTestCases) { - Given(given) - val query = - s"SELECT * FROM m WHERE $mClause AND (a, b) NOT IN (SELECT c, d FROM s WHERE $sClause)" - checkAnswer(sql(query), expectedOutput) - } - } - - test("IN predicate subquery within OR") { checkAnswer( sql("select * from l where l.a in (select c from r)" + From b37aa3e9f90d953c636af5dd161a4a66e64ff6f1 Mon Sep 17 00:00:00 2001 From: Miles Yucht Date: Fri, 25 May 2018 17:38:18 +0200 Subject: [PATCH 3/5] Add tests with literals --- ...not-in-unit-tests-multi-column-literal.sql | 58 +++++++++++ ...ot-in-unit-tests-single-column-literal.sql | 42 ++++++++ ...in-unit-tests-multi-column-literal.sql.out | 97 +++++++++++++++++++ ...n-unit-tests-single-column-literal.sql.out | 69 +++++++++++++ 4 files changed, 266 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql new file mode 100644 index 0000000000000..fd4518e2166e7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql @@ -0,0 +1,58 @@ +-- Unit tests for simple NOT IN predicate subquery across multiple columns. +-- +-- See not-in-single-column-unit-tests.sql for an introduction. +-- This file has the same test cases as not-in-unit-tests-multi-column.sql with literals instead of +-- subqueries. Small changes have been made to the literals to make them typecheck. + +CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES + (null, null), + (null, 1.0), + (2, 3.0), + (4, 5.0) + AS m(a, b); + + -- Case 1 (not possible to write a literal with no rows, so we ignore it.) + -- (subquery is empty -> row is returned) + + -- Case 2 + -- (subquery contains a row with null in all columns -> row not returned) +SELECT * +FROM m +WHERE (a, b) NOT IN ((CAST (null AS INT), CAST (null AS DECIMAL(2, 1)))); + + -- Case 3 + -- (probe-side columns are all null -> row not returned) +SELECT * +FROM m +WHERE a IS NULL AND b IS NULL -- Matches only (null, null) + AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))); + + -- Case 4 + -- (one column null, other column matches a row in the subquery result -> row not returned) +SELECT * +FROM m +WHERE b = 1.0 -- Matches (null, 1.0) + AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))); + + -- Case 5 + -- (one null column with no match -> row is returned) +SELECT * +FROM m +WHERE b = 1.0 -- Matches (null, 1.0) + AND (a, b) NOT IN ((2, 3.0)); + + -- Case 6 + -- (no null columns with match -> row not returned) +SELECT * +FROM m +WHERE b = 3.0 -- Matches (2, 3.0) + AND (a, b) NOT IN ((2, 3.0)) +; + + -- Case 7 + -- (no null columns with no match -> row is returned) +SELECT * +FROM m +WHERE b = 5.0 -- Matches (4, 5.0) + AND (a, b) NOT IN ((2, 3.0)) +; diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql new file mode 100644 index 0000000000000..b261363d1dde7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql @@ -0,0 +1,42 @@ +-- Unit tests for simple NOT IN with a literal expression of a single column +-- +-- More information can be found in not-in-unit-tests-single-column.sql. +-- This file has the same test cases as not-in-unit-tests-single-column.sql with literals instead of +-- subqueries. + +CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES + (null, 1.0), + (2, 3.0), + (4, 5.0) + AS m(a, b); + + -- Uncorrelated NOT IN Subquery test cases + -- Case 1 (not possible to write a literal with no rows, so we ignore it.) + -- (empty subquery -> all rows returned) + + -- Case 2 + -- (subquery includes null -> no rows returned) +SELECT * +FROM m +WHERE a NOT IN (null); + + -- Case 3 + -- (probe column is null -> row not returned) +SELECT * +FROM m +WHERE b = 1.0 -- Only matches (null, 1.0) + AND a NOT IN (2); + + -- Case 4 + -- (probe column matches subquery row -> row not returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (2); + + -- Case 5 + -- (probe column does not match subquery row -> row is returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (6); diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out new file mode 100644 index 0000000000000..737fdfad901f7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out @@ -0,0 +1,97 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 7 + + +-- !query 0 +CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES + (null, null), + (null, 1.0), + (2, 3.0), + (4, 5.0) + AS m(a, b) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +-- Case 1 (not possible to write a literal with no rows, so we ignore it.) + -- (subquery is empty -> row is returned) + + -- Case 2 + -- (subquery contains a row with null in all columns -> row not returned) +SELECT * +FROM m +WHERE (a, b) NOT IN ((CAST (null AS INT), CAST (null AS DECIMAL(2, 1)))) +-- !query 1 schema +struct +-- !query 1 output +2 3 +4 5 +NULL 1 + + +-- !query 2 +-- Case 3 + -- (probe-side columns are all null -> row not returned) +SELECT * +FROM m +WHERE a IS NULL AND b IS NULL -- Matches only (null, null) + AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))) +-- !query 2 schema +struct +-- !query 2 output +NULL NULL + + +-- !query 3 +-- Case 4 + -- (one column null, other column matches a row in the subquery result -> row not returned) +SELECT * +FROM m +WHERE b = 1.0 -- Matches (null, 1.0) + AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))) +-- !query 3 schema +struct +-- !query 3 output +NULL 1 + + +-- !query 4 +-- Case 5 + -- (one null column with no match -> row is returned) +SELECT * +FROM m +WHERE b = 1.0 -- Matches (null, 1.0) + AND (a, b) NOT IN ((2, 3.0)) +-- !query 4 schema +struct +-- !query 4 output +NULL 1 + + +-- !query 5 +-- Case 6 + -- (no null columns with match -> row not returned) +SELECT * +FROM m +WHERE b = 3.0 -- Matches (2, 3.0) + AND (a, b) NOT IN ((2, 3.0)) +-- !query 5 schema +struct +-- !query 5 output + + + +-- !query 6 +-- Case 7 + -- (no null columns with no match -> row is returned) +SELECT * +FROM m +WHERE b = 5.0 -- Matches (4, 5.0) + AND (a, b) NOT IN ((2, 3.0)) +-- !query 6 schema +struct +-- !query 6 output +4 5 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out new file mode 100644 index 0000000000000..446447e890449 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out @@ -0,0 +1,69 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 5 + + +-- !query 0 +CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES + (null, 1.0), + (2, 3.0), + (4, 5.0) + AS m(a, b) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +-- Uncorrelated NOT IN Subquery test cases + -- Case 1 (not possible to write a literal with no rows, so we ignore it.) + -- (empty subquery -> all rows returned) + + -- Case 2 + -- (subquery includes null -> no rows returned) +SELECT * +FROM m +WHERE a NOT IN (null) +-- !query 1 schema +struct +-- !query 1 output + + + +-- !query 2 +-- Case 3 + -- (probe column is null -> row not returned) +SELECT * +FROM m +WHERE b = 1.0 -- Only matches (null, 1.0) + AND a NOT IN (2) +-- !query 2 schema +struct +-- !query 2 output + + + +-- !query 3 +-- Case 4 + -- (probe column matches subquery row -> row not returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (2) +-- !query 3 schema +struct +-- !query 3 output + + + +-- !query 4 +-- Case 5 + -- (probe column does not match subquery row -> row is returned) +SELECT * +FROM m +WHERE b = 3.0 -- Only matches (2, 3.0) + AND a NOT IN (6) +-- !query 4 schema +struct +-- !query 4 output +2 3 From 4a55c183bd3a429c5e219c4941d69b9685cf2010 Mon Sep 17 00:00:00 2001 From: Miles Yucht Date: Sat, 26 May 2018 00:40:51 +0200 Subject: [PATCH 4/5] Remove broken queries --- ...not-in-unit-tests-multi-column-literal.sql | 15 +------ ...in-unit-tests-multi-column-literal.sql.out | 44 ++++--------------- 2 files changed, 11 insertions(+), 48 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql index fd4518e2166e7..3723a719315c0 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql @@ -20,19 +20,8 @@ SELECT * FROM m WHERE (a, b) NOT IN ((CAST (null AS INT), CAST (null AS DECIMAL(2, 1)))); - -- Case 3 - -- (probe-side columns are all null -> row not returned) -SELECT * -FROM m -WHERE a IS NULL AND b IS NULL -- Matches only (null, null) - AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))); - - -- Case 4 - -- (one column null, other column matches a row in the subquery result -> row not returned) -SELECT * -FROM m -WHERE b = 1.0 -- Matches (null, 1.0) - AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))); +-- Cases 3 and 4 are currently broken, so I have commented them out here. +-- Filed https://issues.apache.org/jira/browse/SPARK-24395 to fix and restore these test cases. -- Case 5 -- (one null column with no match -> row is returned) diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out index 737fdfad901f7..deb702126b21c 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 7 +-- Number of queries: 5 -- !query 0 @@ -33,65 +33,39 @@ NULL 1 -- !query 2 --- Case 3 - -- (probe-side columns are all null -> row not returned) -SELECT * -FROM m -WHERE a IS NULL AND b IS NULL -- Matches only (null, null) - AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))) --- !query 2 schema -struct --- !query 2 output -NULL NULL - - --- !query 3 --- Case 4 - -- (one column null, other column matches a row in the subquery result -> row not returned) -SELECT * -FROM m -WHERE b = 1.0 -- Matches (null, 1.0) - AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))) --- !query 3 schema -struct --- !query 3 output -NULL 1 - - --- !query 4 -- Case 5 -- (one null column with no match -> row is returned) SELECT * FROM m WHERE b = 1.0 -- Matches (null, 1.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 4 schema +-- !query 2 schema struct --- !query 4 output +-- !query 2 output NULL 1 --- !query 5 +-- !query 3 -- Case 6 -- (no null columns with match -> row not returned) SELECT * FROM m WHERE b = 3.0 -- Matches (2, 3.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 5 schema +-- !query 3 schema struct --- !query 5 output +-- !query 3 output --- !query 6 +-- !query 4 -- Case 7 -- (no null columns with no match -> row is returned) SELECT * FROM m WHERE b = 5.0 -- Matches (4, 5.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 6 schema +-- !query 4 schema struct --- !query 6 output +-- !query 4 output 4 5 From 8bb8963bb6e9af0fff5cab33c1e366267e8b7eaf Mon Sep 17 00:00:00 2001 From: Miles Yucht Date: Sat, 26 May 2018 09:31:51 +0200 Subject: [PATCH 5/5] Remove one last broken query --- ...not-in-unit-tests-multi-column-literal.sql | 18 +++------- ...in-unit-tests-multi-column-literal.sql.out | 35 +++++-------------- 2 files changed, 14 insertions(+), 39 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql index 3723a719315c0..8eea84f4f5272 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql @@ -11,16 +11,10 @@ CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES (4, 5.0) AS m(a, b); - -- Case 1 (not possible to write a literal with no rows, so we ignore it.) - -- (subquery is empty -> row is returned) +-- Case 1 (not possible to write a literal with no rows, so we ignore it.) +-- (subquery is empty -> row is returned) - -- Case 2 - -- (subquery contains a row with null in all columns -> row not returned) -SELECT * -FROM m -WHERE (a, b) NOT IN ((CAST (null AS INT), CAST (null AS DECIMAL(2, 1)))); - --- Cases 3 and 4 are currently broken, so I have commented them out here. +-- Cases 2, 3 and 4 are currently broken, so I have commented them out here. -- Filed https://issues.apache.org/jira/browse/SPARK-24395 to fix and restore these test cases. -- Case 5 @@ -35,13 +29,11 @@ WHERE b = 1.0 -- Matches (null, 1.0) SELECT * FROM m WHERE b = 3.0 -- Matches (2, 3.0) - AND (a, b) NOT IN ((2, 3.0)) -; + AND (a, b) NOT IN ((2, 3.0)); -- Case 7 -- (no null columns with no match -> row is returned) SELECT * FROM m WHERE b = 5.0 -- Matches (4, 5.0) - AND (a, b) NOT IN ((2, 3.0)) -; + AND (a, b) NOT IN ((2, 3.0)); diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out index deb702126b21c..a16e98af9a417 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 5 +-- Number of queries: 4 -- !query 0 @@ -16,56 +16,39 @@ struct<> -- !query 1 --- Case 1 (not possible to write a literal with no rows, so we ignore it.) - -- (subquery is empty -> row is returned) - - -- Case 2 - -- (subquery contains a row with null in all columns -> row not returned) -SELECT * -FROM m -WHERE (a, b) NOT IN ((CAST (null AS INT), CAST (null AS DECIMAL(2, 1)))) --- !query 1 schema -struct --- !query 1 output -2 3 -4 5 -NULL 1 - - --- !query 2 -- Case 5 -- (one null column with no match -> row is returned) SELECT * FROM m WHERE b = 1.0 -- Matches (null, 1.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 2 schema +-- !query 1 schema struct --- !query 2 output +-- !query 1 output NULL 1 --- !query 3 +-- !query 2 -- Case 6 -- (no null columns with match -> row not returned) SELECT * FROM m WHERE b = 3.0 -- Matches (2, 3.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 3 schema +-- !query 2 schema struct --- !query 3 output +-- !query 2 output --- !query 4 +-- !query 3 -- Case 7 -- (no null columns with no match -> row is returned) SELECT * FROM m WHERE b = 5.0 -- Matches (4, 5.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 4 schema +-- !query 3 schema struct --- !query 4 output +-- !query 3 output 4 5