diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 76df66b9de50c..c6cb52d29344b 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -224,6 +224,8 @@ license: |
 
   - Since Spark 3.0, when casting string value to integral types(tinyint, smallint, int and bigint), datetime types(date, timestamp and interval) and boolean type, the leading and trailing whitespaces (<= ASCII 32) will be trimmed before converted to these type values, e.g. `cast(' 1\t' as int)` results `1`, `cast(' 1\t' as boolean)` results `true`, `cast('2019-10-10\t as date)` results the date value `2019-10-10`. In Spark version 2.4 and earlier, while casting string to integrals and booleans, it will not trim the whitespaces from both ends, the foregoing results will be `null`, while to datetimes, only the trailing spaces (= ASCII 32) will be removed.
 
+  - Since Spark 3.0, an analysis exception will be thrown when hash expressions are applied on elements of MapType. To restore the behavior before Spark 3.0, set `spark.sql.legacy.useHashOnMapType` to `true`.
+
   - Since Spark 3.0, numbers written in scientific notation(e.g. `1E2`) would be parsed as Double. In Spark version 2.4 and earlier, they're parsed as Decimal. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.exponentLiteralAsDecimal.enabled` to `true`.
 
   - Since Spark 3.0, we pad decimal numbers with trailing zeros to the scale of the column for `spark-sql` interface, for example:
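To make the migration note above concrete, here is a minimal spark-shell sketch of the new behavior and the legacy escape hatch (a hypothetical session; the exact exception text comes from the check added to hash.scala below):

```scala
// Spark 3.0 default: hashing a map is rejected at analysis time.
spark.sql("SELECT hash(map(1, 10))")
// => org.apache.spark.sql.AnalysisException: ... cannot contain elements of MapType ...

// Opting back into the 2.4 behavior via the legacy flag introduced here:
spark.conf.set("spark.sql.legacy.useHashOnMapType", "true")
spark.sql("SELECT hash(map(1, 10))").show()   // hashes the map as in Spark 2.4
```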
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index 52429a63b306d..14d5f25d42d0d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.expressions.codegen.Block._
 import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
 import org.apache.spark.sql.catalyst.util.DateTimeConstants._
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.Platform
 import org.apache.spark.unsafe.hash.Murmur3_x86_32
@@ -232,9 +233,6 @@ case class Crc32(child: Expression) extends UnaryExpression with ImplicitCastInp
  *  - array: The `result` starts with seed, then use `result` as seed, recursively
  *           calculate hash value for each element, and assign the element hash value
  *           to `result`.
- *  - map: The `result` starts with seed, then use `result` as seed, recursively
- *         calculate hash value for each key-value, and assign the key-value hash
- *         value to `result`.
  *  - struct: The `result` starts with seed, then use `result` as seed, recursively
  *            calculate hash value for each field, and assign the field hash value to
  *            `result`.
@@ -249,10 +247,21 @@ abstract class HashExpression[E] extends Expression {
 
   override def nullable: Boolean = false
 
+  private def hasMapType(dt: DataType): Boolean = {
+    dt.existsRecursively(_.isInstanceOf[MapType])
+  }
+
   override def checkInputDataTypes(): TypeCheckResult = {
     if (children.length < 1) {
       TypeCheckResult.TypeCheckFailure(
         s"input to function $prettyName requires at least one argument")
+    } else if (children.exists(child => hasMapType(child.dataType)) &&
+        !SQLConf.get.getConf(SQLConf.LEGACY_USE_HASH_ON_MAPTYPE)) {
+      TypeCheckResult.TypeCheckFailure(
+        s"input to function $prettyName cannot contain elements of MapType. In Spark, same maps " +
+          "may have different hashcode, thus hash expressions are prohibited on MapType " +
+          s"elements. To restore previous behavior set ${SQLConf.LEGACY_USE_HASH_ON_MAPTYPE.key} " +
+          "to true.")
     } else {
       TypeCheckResult.TypeCheckSuccess
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 8c5debd28ce56..424ab20d254a6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2205,6 +2205,12 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
+  val LEGACY_USE_HASH_ON_MAPTYPE = buildConf("spark.sql.legacy.useHashOnMapType")
+    .doc("When set to true, hash expressions can be applied on elements of MapType. Otherwise, " +
+      "an analysis exception will be thrown.")
+    .booleanConf
+    .createWithDefault(false)
+
   /**
    * Holds information about keys that have been deprecated.
    *
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
index e6cf979649c83..68da1faaa8f45 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
@@ -554,28 +554,14 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       .add("arrayOfString", arrayOfString)
       .add("arrayOfArrayOfString", ArrayType(arrayOfString))
       .add("arrayOfArrayOfInt", ArrayType(ArrayType(IntegerType)))
-      .add("arrayOfMap", ArrayType(mapOfString))
       .add("arrayOfStruct", ArrayType(structOfString))
       .add("arrayOfUDT", arrayOfUDT))
 
-  testHash(
-    new StructType()
-      .add("mapOfIntAndString", MapType(IntegerType, StringType))
-      .add("mapOfStringAndArray", MapType(StringType, arrayOfString))
-      .add("mapOfArrayAndInt", MapType(arrayOfString, IntegerType))
-      .add("mapOfArray", MapType(arrayOfString, arrayOfString))
-      .add("mapOfStringAndStruct", MapType(StringType, structOfString))
-      .add("mapOfStructAndString", MapType(structOfString, StringType))
-      .add("mapOfStruct", MapType(structOfString, structOfString)))
-
   testHash(
     new StructType()
       .add("structOfString", structOfString)
       .add("structOfStructOfString", new StructType().add("struct", structOfString))
       .add("structOfArray", new StructType().add("array", arrayOfString))
-      .add("structOfMap", new StructType().add("map", mapOfString))
-      .add("structOfArrayAndMap",
-        new StructType().add("array", arrayOfString).add("map", mapOfString))
       .add("structOfUDT", structOfUDT))
 
   test("hive-hash for decimal") {
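The new `checkInputDataTypes` branch above detects maps anywhere inside an input type via `existsRecursively`, which is why the map-bearing cases are dropped from HashExpressionsSuite: maps nested in arrays and structs are rejected just like top-level maps. A rough sketch of that detection, mirroring the private `hasMapType` helper (`existsRecursively` is a Catalyst-internal helper, so treat this as an in-tree sketch rather than user-facing API):

```scala
import org.apache.spark.sql.types._

// Sketch of the recursive map detection used by the analysis check:
// true if the type is a MapType or contains one at any nesting depth.
def containsMap(dt: DataType): Boolean = dt.existsRecursively(_.isInstanceOf[MapType])

containsMap(MapType(IntegerType, StringType))                              // true
containsMap(ArrayType(MapType(StringType, LongType)))                      // true (map inside array)
containsMap(new StructType().add("m", MapType(IntegerType, IntegerType)))  // true (map inside struct)
containsMap(ArrayType(new StructType().add("s", StringType)))              // false
```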
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index d6efb2f91ca0b..6e6dce0bce218 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2120,6 +2120,25 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
     }
   }
 
+  test("SPARK-27619: Throw analysis exception when hash and xxhash64 is used on MapType") {
+    Seq("hash", "xxhash64").foreach {
+      case hashExpression =>
+        intercept[AnalysisException] {
+          spark.createDataset(Map(1 -> 10, 2 -> 20) :: Nil).selectExpr(s"$hashExpression(*)")
+        }
+    }
+  }
+
+  test("SPARK-27619: when spark.sql.legacy.useHashOnMapType is true, hash can be used on Maptype") {
+    Seq("hash", "xxhash64").foreach {
+      case hashExpression =>
+        withSQLConf(SQLConf.LEGACY_USE_HASH_ON_MAPTYPE.key -> "true") {
+          val df = spark.createDataset(Map() :: Nil)
+          checkAnswer(df.selectExpr(s"$hashExpression(*)"), sql(s"SELECT $hashExpression(map())"))
+        }
+    }
+  }
+
   test("xxhash64 function") {
     val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j")
     withTempView("tbl") {
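As an interactive counterpart to the tests above, the same behavior can be observed from the DataFrame API (a spark-shell sketch; `hash` and `xxhash64` are the functions from `org.apache.spark.sql.functions`, and the concrete hash values are not meaningful):

```scala
import org.apache.spark.sql.functions.{hash, xxhash64}
import spark.implicits._

val df = Seq(Map(1 -> 10, 2 -> 20)).toDF("m")

// With the default setting, both expressions fail analysis with the new error message.
// df.select(hash($"m")).show()       // AnalysisException
// df.select(xxhash64($"m")).show()   // AnalysisException

// With the legacy flag set, the Spark 2.4 hashing of map elements is restored.
spark.conf.set("spark.sql.legacy.useHashOnMapType", "true")
df.select(hash($"m"), xxhash64($"m")).show()
```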