Skip to content

Commit 94f9227

Browse files
kiszkcloud-fan
authored andcommitted
[SPARK-22550][SQL] Fix 64KB JVM bytecode limit problem with elt
This PR changes `elt` code generation to place generated code for expression for arguments into separated methods if these size could be large. This PR resolved the case of `elt` with a lot of argument Added new test cases into `StringExpressionsSuite` Author: Kazuaki Ishizaki <[email protected]> Closes #19778 from kiszk/SPARK-22550. (cherry picked from commit 9bdff0b) Signed-off-by: Wenchen Fan <[email protected]>
1 parent 23eb4d7 commit 94f9227

File tree

3 files changed

+70
-23
lines changed

3 files changed

+70
-23
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -660,20 +660,7 @@ class CodegenContext {
660660
returnType: String = "void",
661661
makeSplitFunction: String => String = identity,
662662
foldFunctions: Seq[String] => String = _.mkString("", ";\n", ";")): String = {
663-
val blocks = new ArrayBuffer[String]()
664-
val blockBuilder = new StringBuilder()
665-
for (code <- expressions) {
666-
// We can't know how many bytecode will be generated, so use the length of source code
667-
// as metric. A method should not go beyond 8K, otherwise it will not be JITted, should
668-
// also not be too small, or it will have many function calls (for wide table), see the
669-
// results in BenchmarkWideTable.
670-
if (blockBuilder.length > 1024) {
671-
blocks += blockBuilder.toString()
672-
blockBuilder.clear()
673-
}
674-
blockBuilder.append(code)
675-
}
676-
blocks += blockBuilder.toString()
663+
val blocks = buildCodeBlocks(expressions)
677664

678665
if (blocks.length == 1) {
679666
// inline execution if only one block
@@ -696,6 +683,29 @@ class CodegenContext {
696683
}
697684
}
698685

686+
/**
687+
* Splits the generated code of expressions into multiple sequences of String
688+
* based on a threshold of length of a String
689+
*
690+
* @param expressions the codes to evaluate expressions.
691+
*/
692+
def buildCodeBlocks(expressions: Seq[String]): Seq[String] = {
693+
val blocks = new ArrayBuffer[String]()
694+
val blockBuilder = new StringBuilder()
695+
for (code <- expressions) {
696+
// We can't know how many bytecode will be generated, so use the length of source code
697+
// as metric. A method should not go beyond 8K, otherwise it will not be JITted, should
698+
// also not be too small, or it will have many function calls (for wide table), see the
699+
// results in BenchmarkWideTable.
700+
if (blockBuilder.length > 1024) {
701+
blocks += blockBuilder.toString()
702+
blockBuilder.clear()
703+
}
704+
blockBuilder.append(code)
705+
}
706+
blocks += blockBuilder.toString()
707+
}
708+
699709
/**
700710
* Wrap the generated code of expression, which was created from a row object in INPUT_ROW,
701711
* by a function. ev.isNull and ev.value are passed by global variables

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -287,22 +287,52 @@ case class Elt(children: Seq[Expression])
287287
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
288288
val index = indexExpr.genCode(ctx)
289289
val strings = stringExprs.map(_.genCode(ctx))
290+
val indexVal = ctx.freshName("index")
291+
val stringVal = ctx.freshName("stringVal")
290292
val assignStringValue = strings.zipWithIndex.map { case (eval, index) =>
291293
s"""
292294
case ${index + 1}:
293-
${ev.value} = ${eval.isNull} ? null : ${eval.value};
295+
${eval.code}
296+
$stringVal = ${eval.isNull} ? null : ${eval.value};
294297
break;
295298
"""
296-
}.mkString("\n")
297-
val indexVal = ctx.freshName("index")
298-
val stringArray = ctx.freshName("strings");
299+
}
299300

300-
ev.copy(index.code + "\n" + strings.map(_.code).mkString("\n") + s"""
301-
final int $indexVal = ${index.value};
302-
UTF8String ${ev.value} = null;
303-
switch ($indexVal) {
304-
$assignStringValue
301+
val cases = ctx.buildCodeBlocks(assignStringValue)
302+
val codes = if (cases.length == 1) {
303+
s"""
304+
UTF8String $stringVal = null;
305+
switch ($indexVal) {
306+
${cases.head}
307+
}
308+
"""
309+
} else {
310+
var prevFunc = "null"
311+
for (c <- cases.reverse) {
312+
val funcName = ctx.freshName("eltFunc")
313+
val funcBody = s"""
314+
private UTF8String $funcName(InternalRow ${ctx.INPUT_ROW}, int $indexVal) {
315+
UTF8String $stringVal = null;
316+
switch ($indexVal) {
317+
$c
318+
default:
319+
return $prevFunc;
320+
}
321+
return $stringVal;
322+
}
323+
"""
324+
ctx.addNewFunction(funcName, funcBody)
325+
prevFunc = s"$funcName(${ctx.INPUT_ROW}, $indexVal)"
305326
}
327+
s"UTF8String $stringVal = $prevFunc;"
328+
}
329+
330+
ev.copy(
331+
s"""
332+
${index.code}
333+
final int $indexVal = ${index.value};
334+
$codes
335+
UTF8String ${ev.value} = $stringVal;
306336
final boolean ${ev.isNull} = ${ev.value} == null;
307337
""")
308338
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,13 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
117117
assert(Elt(Seq(Literal(1), Literal(2))).checkInputDataTypes().isFailure)
118118
}
119119

120+
test("SPARK-22550: Elt should not generate codes beyond 64KB") {
121+
val N = 10000
122+
val strings = (1 to N).map(x => s"s$x")
123+
val args = Literal.create(N, IntegerType) +: strings.map(Literal.create(_, StringType))
124+
checkEvaluation(Elt(args), s"s$N")
125+
}
126+
120127
test("StringComparison") {
121128
val row = create_row("abc", null)
122129
val c1 = 'a.string.at(0)

0 commit comments

Comments
 (0)