-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-22682][SQL] HashExpression does not need to create global variables #19878
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -270,17 +270,36 @@ abstract class HashExpression[E] extends Expression { | |
|
||
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
ev.isNull = "false" | ||
val childrenHash = ctx.splitExpressions(children.map { child => | ||
|
||
val childrenHash = children.map { child => | ||
val childGen = child.genCode(ctx) | ||
childGen.code + ctx.nullSafeExec(child.nullable, childGen.isNull) { | ||
computeHash(childGen.value, child.dataType, ev.value, ctx) | ||
} | ||
}) | ||
} | ||
|
||
val hashResultType = ctx.javaType(dataType) | ||
val codes = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { | ||
childrenHash.mkString("\n") | ||
} else { | ||
ctx.splitExpressions( | ||
expressions = childrenHash, | ||
funcName = "computeHash", | ||
arguments = Seq("InternalRow" -> ctx.INPUT_ROW, hashResultType -> ev.value), | ||
returnType = hashResultType, | ||
makeSplitFunction = body => | ||
s""" | ||
|$body | ||
|return ${ev.value}; | ||
""".stripMargin, | ||
foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) | ||
} | ||
|
||
ctx.addMutableState(ctx.javaType(dataType), ev.value) | ||
ev.copy(code = s""" | ||
${ev.value} = $seed; | ||
$childrenHash""") | ||
ev.copy(code = | ||
s""" | ||
|$hashResultType ${ev.value} = $seed; | ||
|$codes | ||
""".stripMargin) | ||
} | ||
|
||
protected def nullSafeElementHash( | ||
|
@@ -389,13 +408,21 @@ abstract class HashExpression[E] extends Expression { | |
input: String, | ||
result: String, | ||
fields: Array[StructField]): String = { | ||
val hashes = fields.zipWithIndex.map { case (field, index) => | ||
val fieldsHash = fields.zipWithIndex.map { case (field, index) => | ||
nullSafeElementHash(input, index.toString, field.nullable, field.dataType, result, ctx) | ||
} | ||
val hashResultType = ctx.javaType(dataType) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: this is also done in line 281. Can we do this only once? maybe with a There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
ctx.splitExpressions( | ||
expressions = hashes, | ||
funcName = "getHash", | ||
arguments = Seq("InternalRow" -> input)) | ||
expressions = fieldsHash, | ||
funcName = "computeHashForStruct", | ||
arguments = Seq("InternalRow" -> input, hashResultType -> result), | ||
returnType = hashResultType, | ||
makeSplitFunction = body => | ||
s""" | ||
|$body | ||
|return $result; | ||
""".stripMargin, | ||
foldFunctions = _.map(funcCall => s"$result = $funcCall;").mkString("\n")) | ||
} | ||
|
||
@tailrec | ||
|
@@ -610,25 +637,44 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { | |
|
||
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
ev.isNull = "false" | ||
|
||
val childHash = ctx.freshName("childHash") | ||
val childrenHash = ctx.splitExpressions(children.map { child => | ||
val childrenHash = children.map { child => | ||
val childGen = child.genCode(ctx) | ||
val codeToComputeHash = ctx.nullSafeExec(child.nullable, childGen.isNull) { | ||
computeHash(childGen.value, child.dataType, childHash, ctx) | ||
} | ||
s""" | ||
|${childGen.code} | ||
|$childHash = 0; | ||
|$codeToComputeHash | ||
|${ev.value} = (31 * ${ev.value}) + $childHash; | ||
|$childHash = 0; | ||
""".stripMargin | ||
}) | ||
} | ||
|
||
ctx.addMutableState(ctx.javaType(dataType), ev.value) | ||
ctx.addMutableState(ctx.JAVA_INT, childHash, s"$childHash = 0;") | ||
ev.copy(code = s""" | ||
${ev.value} = $seed; | ||
$childrenHash""") | ||
val codes = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { | ||
childrenHash.mkString("\n") | ||
} else { | ||
ctx.splitExpressions( | ||
expressions = childrenHash, | ||
funcName = "computeHash", | ||
arguments = Seq("InternalRow" -> ctx.INPUT_ROW, ctx.JAVA_INT -> ev.value), | ||
returnType = ctx.JAVA_INT, | ||
makeSplitFunction = body => | ||
s""" | ||
|${ctx.JAVA_INT} $childHash = 0; | ||
|$body | ||
|return ${ev.value}; | ||
""".stripMargin, | ||
foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) | ||
} | ||
|
||
ev.copy(code = | ||
s""" | ||
|${ctx.JAVA_INT} ${ev.value} = $seed; | ||
|${ctx.JAVA_INT} $childHash = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nvm, |
||
|$codes | ||
""".stripMargin) | ||
} | ||
|
||
override def eval(input: InternalRow = null): Int = { | ||
|
@@ -730,23 +776,29 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { | |
input: String, | ||
result: String, | ||
fields: Array[StructField]): String = { | ||
val localResult = ctx.freshName("localResult") | ||
val childResult = ctx.freshName("childResult") | ||
fields.zipWithIndex.map { case (field, index) => | ||
val fieldsHash = fields.zipWithIndex.map { case (field, index) => | ||
val computeFieldHash = nullSafeElementHash( | ||
input, index.toString, field.nullable, field.dataType, childResult, ctx) | ||
s""" | ||
$childResult = 0; | ||
${nullSafeElementHash(input, index.toString, field.nullable, field.dataType, | ||
childResult, ctx)} | ||
$localResult = (31 * $localResult) + $childResult; | ||
""" | ||
}.mkString( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We forgot to split the code for computing the Hive hash of a struct; it's fixed now. |
||
s""" | ||
int $localResult = 0; | ||
int $childResult = 0; | ||
""", | ||
"", | ||
s"$result = (31 * $result) + $localResult;" | ||
) | ||
|$childResult = 0; | ||
|$computeFieldHash | ||
|$result = (31 * $result) + $childResult; | ||
""".stripMargin | ||
} | ||
|
||
s"${ctx.JAVA_INT} $childResult = 0;\n" + ctx.splitExpressions( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No need to check There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea, the input here is a row that may be produced by |
||
expressions = fieldsHash, | ||
funcName = "computeHashForStruct", | ||
arguments = Seq("InternalRow" -> input, ctx.JAVA_INT -> result), | ||
returnType = ctx.JAVA_INT, | ||
makeSplitFunction = body => | ||
s""" | ||
|${ctx.JAVA_INT} $childResult = 0; | ||
|$body | ||
|return $result; | ||
""".stripMargin, | ||
foldFunctions = _.map(funcCall => s"$result = $funcCall;").mkString("\n")) | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This pattern appears many times in the code base; we may need to create a `ctx.splitExpressionsWithCurrentInput` for it later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think @kiszk is doing this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That one has been merged, but this one is still different.