Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -104,16 +104,48 @@ abstract class Expression extends TreeNode[Expression] {
}.getOrElse {
val isNull = ctx.freshName("isNull")
val value = ctx.freshName("value")
val ve = doGenCode(ctx, ExprCode("", isNull, value))
if (ve.code.nonEmpty) {
val eval = doGenCode(ctx, ExprCode("", isNull, value))
reduceCodeSize(ctx, eval)
if (eval.code.nonEmpty) {
// Add `this` in the comment.
ve.copy(code = s"${ctx.registerComment(this.toString)}\n" + ve.code.trim)
eval.copy(code = s"${ctx.registerComment(this.toString)}\n" + eval.code.trim)
} else {
ve
eval
}
}
}

private def reduceCodeSize(ctx: CodegenContext, eval: ExprCode): Unit = {
// TODO: support whole stage codegen too
if (eval.code.trim.length > 1024 && ctx.INPUT_ROW != null && ctx.currentVars == null) {
val setIsNull = if (eval.isNull != "false" && eval.isNull != "true") {
val globalIsNull = ctx.freshName("globalIsNull")
ctx.addMutableState(ctx.JAVA_BOOLEAN, globalIsNull)
val localIsNull = eval.isNull
eval.isNull = globalIsNull
s"$globalIsNull = $localIsNull;"
} else {
""
}

val javaType = ctx.javaType(dataType)
val newValue = ctx.freshName("value")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this needed? I think we can use eval.value instead of it

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ev.value may be a global variable and here we need a local variable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we strictly need a local variable here? Can't we simply assign ev.value to the generated function return value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then how are we going to change this?
eval.code = s"$javaType $newValue = $funcFullName(${ctx.INPUT_ROW});"

Saving a local variable is nothing and I think we shouldn't complicate the code(check if a variable is global) because of this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, do you mean just do eval.value = s"$funcFullName(${ctx.INPUT_ROW})"? Let me try

Copy link
Contributor

@mgaido91 mgaido91 Nov 22, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant:

eval.code = s"${eval.value} = $funcFullName(${ctx.INPUT_ROW});"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this won't work because ${eval.value} is not declared if it's not a global variable. I went with

eval.code = ""
eval.value = s"$funcFullName(${ctx.INPUT_ROW})"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, sorry, you're right. Then I think your previous solution is better: in this way if eval.value is used multiple times we are recomputing the function every time, thus your original implementation was fine, sorry for the bad comment.


val funcName = ctx.freshName(nodeName)
val funcFullName = ctx.addNewFunction(funcName,
s"""
|private $javaType $funcName(InternalRow ${ctx.INPUT_ROW}) {
| ${eval.code.trim}
| $setIsNull
| return ${eval.value};
|}
""".stripMargin)

eval.value = newValue
eval.code = s"$javaType $newValue = $funcFullName(${ctx.INPUT_ROW});"
}
}

/**
* Returns Java source code that can be compiled to evaluate this expression.
* The default behavior is to call the eval method of the expression. Concrete expression
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -930,36 +930,6 @@ class CodegenContext {
}
}

/**
* Wrap the generated code of expression, which was created from a row object in INPUT_ROW,
* by a function. ev.isNull and ev.value are passed by global variables
*
* @param ev the code to evaluate expressions.
* @param dataType the data type of ev.value.
* @param baseFuncName the split function name base.
*/
def createAndAddFunction(
ev: ExprCode,
dataType: DataType,
baseFuncName: String): (String, String, String) = {
val globalIsNull = freshName("isNull")
addMutableState(JAVA_BOOLEAN, globalIsNull, s"$globalIsNull = false;")
val globalValue = freshName("value")
addMutableState(javaType(dataType), globalValue,
s"$globalValue = ${defaultValue(dataType)};")
val funcName = freshName(baseFuncName)
val funcBody =
s"""
|private void $funcName(InternalRow ${INPUT_ROW}) {
| ${ev.code.trim}
| $globalIsNull = ${ev.isNull};
| $globalValue = ${ev.value};
|}
""".stripMargin
val fullFuncName = addNewFunction(funcName, funcBody)
(fullFuncName, globalIsNull, globalValue)
}

/**
* Perform a function which generates a sequence of ExprCodes with a given mapping between
* expressions and common expressions, instead of using the mapping in current context.
Expand Down Expand Up @@ -1065,7 +1035,8 @@ class CodegenContext {
* elimination will be performed. Subexpression elimination assumes that the code for each
* expression will be combined in the `expressions` order.
*/
def generateExpressions(expressions: Seq[Expression],
def generateExpressions(
expressions: Seq[Expression],
doSubexpressionElimination: Boolean = false): Seq[ExprCode] = {
if (doSubexpressionElimination) subexpressionElimination(expressions)
expressions.map(e => e.genCode(this))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,52 +64,22 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
val trueEval = trueValue.genCode(ctx)
val falseEval = falseValue.genCode(ctx)

// place generated code of condition, true value and false value in separate methods if
// their code combined is large
val combinedLength = condEval.code.length + trueEval.code.length + falseEval.code.length
Copy link
Member

@viirya viirya Nov 19, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I think this removed part is orthogonal to what this PR did. Even condition, true, and false expressions are not more than threshold individually, their combination is still more than the threshold. We will pack them into a big method after this PR.

This PR deals the oversize gen'd codes in deeply nested expressions, not oversize combination of codes from the children.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I already explained it in #19767 (comment)

Mostly it's ok because the threshold is just an estimation, not a big deal to make it 2 times larger. CASE WHEN may be a problem and we can evaluate it in #18641 after this PR gets merged.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two problems I think for this. One is even the two childs' code don't exceed the threshold individually, a method not over 64k but over 8k is still big and bad for JIT. One is we estimate it with code length, I'm not sure if two 1000 length childs won't definitely generate 64k method in the end.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no way to guarantee it with the current string based codegen framework, even without this PR. 1000 length code may also generate 64kb byte code in the end.

1024 is not a good estimation at all, kind of random to me. So multiplying it with 2 doesn't seem a big issue. CASE WHEN may have issue.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW if it's really an issue, we can add splitting logic in non-leaf/non-unary nodes. This is much less work than before because: 1. no need to care about unary nodes 2. the splitting logic can be simpler because all children are guaranteed to generate less than 1000 LOC.

val generatedCode = if (combinedLength > 1024 &&
// Split these expressions only if they are created from a row object
(ctx.INPUT_ROW != null && ctx.currentVars == null)) {

val (condFuncName, condGlobalIsNull, condGlobalValue) =
ctx.createAndAddFunction(condEval, predicate.dataType, "evalIfCondExpr")
val (trueFuncName, trueGlobalIsNull, trueGlobalValue) =
ctx.createAndAddFunction(trueEval, trueValue.dataType, "evalIfTrueExpr")
val (falseFuncName, falseGlobalIsNull, falseGlobalValue) =
ctx.createAndAddFunction(falseEval, falseValue.dataType, "evalIfFalseExpr")
val code =
s"""
$condFuncName(${ctx.INPUT_ROW});
boolean ${ev.isNull} = false;
${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
if (!$condGlobalIsNull && $condGlobalValue) {
$trueFuncName(${ctx.INPUT_ROW});
${ev.isNull} = $trueGlobalIsNull;
${ev.value} = $trueGlobalValue;
} else {
$falseFuncName(${ctx.INPUT_ROW});
${ev.isNull} = $falseGlobalIsNull;
${ev.value} = $falseGlobalValue;
}
"""
}
else {
s"""
${condEval.code}
boolean ${ev.isNull} = false;
${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
if (!${condEval.isNull} && ${condEval.value}) {
${trueEval.code}
${ev.isNull} = ${trueEval.isNull};
${ev.value} = ${trueEval.value};
} else {
${falseEval.code}
${ev.isNull} = ${falseEval.isNull};
${ev.value} = ${falseEval.value};
}
"""
}

ev.copy(code = generatedCode)
|${condEval.code}
|boolean ${ev.isNull} = false;
|${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
|if (!${condEval.isNull} && ${condEval.value}) {
| ${trueEval.code}
| ${ev.isNull} = ${trueEval.isNull};
| ${ev.value} = ${trueEval.value};
|} else {
| ${falseEval.code}
| ${ev.isNull} = ${falseEval.isNull};
| ${ev.value} = ${falseEval.value};
|}
""".stripMargin
ev.copy(code = code)
}

override def toString: String = s"if ($predicate) $trueValue else $falseValue"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,9 @@ case class Alias(child: Expression, name: String)(

/** Just a simple passthrough for code generation. */
override def genCode(ctx: CodegenContext): ExprCode = child.genCode(ctx)
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ev.copy("")
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
throw new IllegalStateException("Alias.doGenCode should not be called.")
}

override def dataType: DataType = child.dataType
override def nullable: Boolean = child.nullable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -378,46 +378,7 @@ case class And(left: Expression, right: Expression) extends BinaryOperator with
val eval2 = right.genCode(ctx)

// The result should be `false`, if any of them is `false` whenever the other is null or not.

// place generated code of eval1 and eval2 in separate methods if their code combined is large
val combinedLength = eval1.code.length + eval2.code.length
if (combinedLength > 1024 &&
// Split these expressions only if they are created from a row object
(ctx.INPUT_ROW != null && ctx.currentVars == null)) {

val (eval1FuncName, eval1GlobalIsNull, eval1GlobalValue) =
ctx.createAndAddFunction(eval1, BooleanType, "eval1Expr")
val (eval2FuncName, eval2GlobalIsNull, eval2GlobalValue) =
ctx.createAndAddFunction(eval2, BooleanType, "eval2Expr")
if (!left.nullable && !right.nullable) {
val generatedCode = s"""
$eval1FuncName(${ctx.INPUT_ROW});
boolean ${ev.value} = false;
if (${eval1GlobalValue}) {
$eval2FuncName(${ctx.INPUT_ROW});
${ev.value} = ${eval2GlobalValue};
}
"""
ev.copy(code = generatedCode, isNull = "false")
} else {
val generatedCode = s"""
$eval1FuncName(${ctx.INPUT_ROW});
boolean ${ev.isNull} = false;
boolean ${ev.value} = false;
if (!${eval1GlobalIsNull} && !${eval1GlobalValue}) {
} else {
$eval2FuncName(${ctx.INPUT_ROW});
if (!${eval2GlobalIsNull} && !${eval2GlobalValue}) {
} else if (!${eval1GlobalIsNull} && !${eval2GlobalIsNull}) {
${ev.value} = true;
} else {
${ev.isNull} = true;
}
}
"""
ev.copy(code = generatedCode)
}
} else if (!left.nullable && !right.nullable) {
if (!left.nullable && !right.nullable) {
ev.copy(code = s"""
${eval1.code}
boolean ${ev.value} = false;
Expand Down Expand Up @@ -480,46 +441,7 @@ case class Or(left: Expression, right: Expression) extends BinaryOperator with P
val eval2 = right.genCode(ctx)

// The result should be `true`, if any of them is `true` whenever the other is null or not.

// place generated code of eval1 and eval2 in separate methods if their code combined is large
val combinedLength = eval1.code.length + eval2.code.length
if (combinedLength > 1024 &&
// Split these expressions only if they are created from a row object
(ctx.INPUT_ROW != null && ctx.currentVars == null)) {

val (eval1FuncName, eval1GlobalIsNull, eval1GlobalValue) =
ctx.createAndAddFunction(eval1, BooleanType, "eval1Expr")
val (eval2FuncName, eval2GlobalIsNull, eval2GlobalValue) =
ctx.createAndAddFunction(eval2, BooleanType, "eval2Expr")
if (!left.nullable && !right.nullable) {
val generatedCode = s"""
$eval1FuncName(${ctx.INPUT_ROW});
boolean ${ev.value} = true;
if (!${eval1GlobalValue}) {
$eval2FuncName(${ctx.INPUT_ROW});
${ev.value} = ${eval2GlobalValue};
}
"""
ev.copy(code = generatedCode, isNull = "false")
} else {
val generatedCode = s"""
$eval1FuncName(${ctx.INPUT_ROW});
boolean ${ev.isNull} = false;
boolean ${ev.value} = true;
if (!${eval1GlobalIsNull} && ${eval1GlobalValue}) {
} else {
$eval2FuncName(${ctx.INPUT_ROW});
if (!${eval2GlobalIsNull} && ${eval2GlobalValue}) {
} else if (!${eval1GlobalIsNull} && !${eval2GlobalIsNull}) {
${ev.value} = false;
} else {
${ev.isNull} = true;
}
}
"""
ev.copy(code = generatedCode)
}
} else if (!left.nullable && !right.nullable) {
if (!left.nullable && !right.nullable) {
ev.isNull = "false"
ev.copy(code = s"""
${eval1.code}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
assert(actual(0) == cases)
}

test("SPARK-18091: split large if expressions into blocks due to JVM code size limit") {
test("SPARK-22543: split large if expressions into blocks due to JVM code size limit") {
var strExpr: Expression = Literal("abc")
for (_ <- 1 to 150) {
strExpr = Decode(Encode(strExpr, "utf-8"), "utf-8")
Expand Down Expand Up @@ -342,7 +342,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
projection(row)
}

test("SPARK-21720: split large predications into blocks due to JVM code size limit") {
test("SPARK-22543: split large predicates into blocks due to JVM code size limit") {
val length = 600

val input = new GenericInternalRow(length)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ case class HashAggregateExec(
private def doProduceWithoutKeys(ctx: CodegenContext): String = {
val initAgg = ctx.freshName("initAgg")
ctx.addMutableState(ctx.JAVA_BOOLEAN, initAgg, s"$initAgg = false;")
// The generated function doesn't have input row in the code context.
ctx.INPUT_ROW = null

// generate variables for aggregation buffer
val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate])
Expand Down