-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-6638] [SQL] Improve performance of StringType in SQL #5350
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
685fd07
21f67c6
4699c3a
d32abd1
a85fb27
6b499ac
5f9e120
38c303e
c7dd4d2
bb52e44
8b45864
23a766c
9dc32d1
73e4363
956b0a4
9f4c194
537631c
28d6f32
28f3d81
e5fa5b8
8d17f21
fd11364
ac18ae6
2089d24
13d9d42
867bf50
1314a37
5116b43
08d897b
b04a19c
744788f
341ec2c
59025c8
6d776a9
2772f0d
3b7bfa8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,6 @@ import java.sql.{Date, Timestamp} | |
| import java.text.{DateFormat, SimpleDateFormat} | ||
|
|
||
| import org.apache.spark.Logging | ||
| import org.apache.spark.sql.catalyst.errors.TreeNodeException | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
| /** Cast the child expression to the target data type. */ | ||
|
|
@@ -112,21 +111,21 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
|
|
||
| // UDFToString | ||
| private[this] def castToString(from: DataType): Any => Any = from match { | ||
| case BinaryType => buildCast[Array[Byte]](_, new String(_, "UTF-8")) | ||
| case DateType => buildCast[Int](_, d => DateUtils.toString(d)) | ||
| case TimestampType => buildCast[Timestamp](_, timestampToString) | ||
| case _ => buildCast[Any](_, _.toString) | ||
| case BinaryType => buildCast[Array[Byte]](_, UTF8String(_)) | ||
| case DateType => buildCast[Int](_, d => UTF8String(DateUtils.toString(d))) | ||
| case TimestampType => buildCast[Timestamp](_, t => UTF8String(timestampToString(t))) | ||
| case _ => buildCast[Any](_, o => UTF8String(o.toString)) | ||
| } | ||
|
|
||
| // BinaryConverter | ||
| private[this] def castToBinary(from: DataType): Any => Any = from match { | ||
| case StringType => buildCast[String](_, _.getBytes("UTF-8")) | ||
| case StringType => buildCast[UTF8String](_, _.getBytes) | ||
| } | ||
|
|
||
| // UDFToBoolean | ||
| private[this] def castToBoolean(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, _.length() != 0) | ||
| buildCast[UTF8String](_, _.length() != 0) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, t => t.getTime() != 0 || t.getNanos() != 0) | ||
| case DateType => | ||
|
|
@@ -151,8 +150,9 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // TimestampConverter | ||
| private[this] def castToTimestamp(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => { | ||
| buildCast[UTF8String](_, utfs => { | ||
| // Throw away extra if more than 9 decimal places | ||
| val s = utfs.toString | ||
| val periodIdx = s.indexOf(".") | ||
| var n = s | ||
| if (periodIdx != -1 && n.length() - periodIdx > 9) { | ||
|
|
@@ -227,8 +227,8 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // DateConverter | ||
| private[this] def castToDate(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => | ||
| try DateUtils.fromJavaDate(Date.valueOf(s)) | ||
| buildCast[UTF8String](_, s => | ||
| try DateUtils.fromJavaDate(Date.valueOf(s.toString)) | ||
| catch { case _: java.lang.IllegalArgumentException => null } | ||
| ) | ||
| case TimestampType => | ||
|
|
@@ -245,7 +245,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // LongConverter | ||
| private[this] def castToLong(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toLong catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toLong catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -261,7 +261,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // IntConverter | ||
| private[this] def castToInt(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toInt catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toInt catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -277,7 +277,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // ShortConverter | ||
| private[this] def castToShort(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toShort catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toShort catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -293,7 +293,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // ByteConverter | ||
| private[this] def castToByte(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toByte catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toByte catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -323,7 +323,9 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
|
|
||
| private[this] def castToDecimal(from: DataType, target: DecimalType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try changePrecision(Decimal(s.toDouble), target) catch { | ||
| buildCast[UTF8String](_, s => try { | ||
| changePrecision(Decimal(s.toString.toDouble), target) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not quite related to your change. But, why we convert the string to double first?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess Double is the wide format and range for numbers, or we need to have a special parser for it. |
||
| } catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -348,7 +350,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // DoubleConverter | ||
| private[this] def castToDouble(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toDouble catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toDouble catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -364,7 +366,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // FloatConverter | ||
| private[this] def castToFloat(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toFloat catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toFloat catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -230,13 +230,17 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR | |
| new GenericRow(newValues) | ||
| } | ||
|
|
||
| override def update(ordinal: Int, value: Any): Unit = { | ||
| if (value == null) setNullAt(ordinal) else values(ordinal).update(value) | ||
| override def update(ordinal: Int, value: Any) { | ||
| if (value == null) { | ||
| setNullAt(ordinal) | ||
| } else { | ||
| values(ordinal).update(value) | ||
| } | ||
| } | ||
|
|
||
| override def setString(ordinal: Int, value: String): Unit = update(ordinal, value) | ||
| override def setString(ordinal: Int, value: String): Unit = update(ordinal, UTF8String(value)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why we are still expecting a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an API, so we should keep it. |
||
|
|
||
| override def getString(ordinal: Int): String = apply(ordinal).asInstanceOf[String] | ||
| override def getString(ordinal: Int): String = apply(ordinal).toString | ||
|
|
||
| override def setInt(ordinal: Int, value: Int): Unit = { | ||
| val currentValue = values(ordinal).asInstanceOf[MutableInt] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -216,10 +216,11 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| val $primitiveTerm: ${termForType(dataType)} = $value | ||
| """.children | ||
|
|
||
| case expressions.Literal(value: String, dataType) => | ||
| case expressions.Literal(value: UTF8String, dataType) => | ||
| q""" | ||
| val $nullTerm = ${value == null} | ||
| val $primitiveTerm: ${termForType(dataType)} = $value | ||
| val $primitiveTerm: ${termForType(dataType)} = | ||
| org.apache.spark.sql.types.UTF8String(${value.getBytes}) | ||
| """.children | ||
|
|
||
| case expressions.Literal(value: Int, dataType) => | ||
|
|
@@ -243,11 +244,14 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| if($nullTerm) | ||
| ${defaultPrimitive(StringType)} | ||
| else | ||
| new String(${eval.primitiveTerm}.asInstanceOf[Array[Byte]]) | ||
| org.apache.spark.sql.types.UTF8String(${eval.primitiveTerm}.asInstanceOf[Array[Byte]]) | ||
| """.children | ||
|
|
||
| case Cast(child @ DateType(), StringType) => | ||
| child.castOrNull(c => q"org.apache.spark.sql.types.DateUtils.toString($c)", StringType) | ||
| child.castOrNull(c => | ||
| q"""org.apache.spark.sql.types.UTF8String( | ||
| org.apache.spark.sql.types.DateUtils.toString($c))""", | ||
| StringType) | ||
|
|
||
| case Cast(child @ NumericType(), IntegerType) => | ||
| child.castOrNull(c => q"$c.toInt", IntegerType) | ||
|
|
@@ -272,9 +276,18 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| if($nullTerm) | ||
| ${defaultPrimitive(StringType)} | ||
| else | ||
| ${eval.primitiveTerm}.toString | ||
| org.apache.spark.sql.types.UTF8String(${eval.primitiveTerm}.toString) | ||
| """.children | ||
|
|
||
| case EqualTo(e1: BinaryType, e2: BinaryType) => | ||
| (e1, e2).evaluateAs (BooleanType) { | ||
| case (eval1, eval2) => | ||
| q""" | ||
| java.util.Arrays.equals($eval1.asInstanceOf[Array[Byte]], | ||
| $eval2.asInstanceOf[Array[Byte]]) | ||
| """ | ||
| } | ||
|
|
||
| case EqualTo(e1, e2) => | ||
| (e1, e2).evaluateAs (BooleanType) { case (eval1, eval2) => q"$eval1 == $eval2" } | ||
|
|
||
|
|
@@ -597,7 +610,8 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| val localLogger = log | ||
| val localLoggerTree = reify { localLogger } | ||
| q""" | ||
| $localLoggerTree.debug(${e.toString} + ": " + (if($nullTerm) "null" else $primitiveTerm)) | ||
| $localLoggerTree.debug( | ||
| ${e.toString} + ": " + (if ($nullTerm) "null" else $primitiveTerm.toString)) | ||
| """ :: Nil | ||
| } else { | ||
| Nil | ||
|
|
@@ -608,6 +622,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
|
|
||
| protected def getColumn(inputRow: TermName, dataType: DataType, ordinal: Int) = { | ||
| dataType match { | ||
| case StringType => q"$inputRow($ordinal).asInstanceOf[org.apache.spark.sql.types.UTF8String]" | ||
| case dt @ NativeType() => q"$inputRow.${accessorForType(dt)}($ordinal)" | ||
| case _ => q"$inputRow.apply($ordinal).asInstanceOf[${termForType(dataType)}]" | ||
| } | ||
|
|
@@ -619,6 +634,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| ordinal: Int, | ||
| value: TermName) = { | ||
| dataType match { | ||
| case StringType => q"$destinationRow.update($ordinal, $value)" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems this one is needed because
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes. |
||
| case dt @ NativeType() => q"$destinationRow.${mutatorForType(dt)}($ordinal, $value)" | ||
| case _ => q"$destinationRow.update($ordinal, $value)" | ||
| } | ||
|
|
@@ -642,13 +658,13 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| case DoubleType => "Double" | ||
| case FloatType => "Float" | ||
| case BooleanType => "Boolean" | ||
| case StringType => "String" | ||
| case StringType => "org.apache.spark.sql.types.UTF8String" | ||
| } | ||
|
|
||
| protected def defaultPrimitive(dt: DataType) = dt match { | ||
| case BooleanType => ru.Literal(Constant(false)) | ||
| case FloatType => ru.Literal(Constant(-1.0.toFloat)) | ||
| case StringType => ru.Literal(Constant("<uninit>")) | ||
| case StringType => q"""org.apache.spark.sql.types.UTF8String("<uninit>")""" | ||
| case ShortType => ru.Literal(Constant(-1.toShort)) | ||
| case LongType => ru.Literal(Constant(-1L)) | ||
| case ByteType => ru.Literal(Constant(-1.toByte)) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we excepting a
Dateobject or itsIntrepresentation? For internal use, I guess we expect an int. But, since users can also call it, for them, aDateobject is expected, right?Right now, if I call
getDate, I will get aClassCastException?Can you file a jira for it for 1.4 and mark it as a blocker?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://issues.apache.org/jira/browse/SPARK-6784