-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-17528][SQL] data should be copied properly before saving into InternalRow #18483
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,9 @@ | |
package org.apache.spark.sql.catalyst | ||
|
||
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} | ||
import org.apache.spark.sql.types.{DataType, Decimal, StructType} | ||
import org.apache.spark.unsafe.types.UTF8String | ||
|
||
/** | ||
* An abstract class for row used internally in Spark SQL, which only contains the columns as | ||
|
@@ -33,6 +35,10 @@ abstract class InternalRow extends SpecializedGetters with Serializable { | |
|
||
def setNullAt(i: Int): Unit | ||
|
||
/** | ||
* Updates the value at column `i`. Note that after updating, the given value will be kept in this | ||
* row, and the caller side should guarantee that this value won't be changed afterwards. | ||
*/ | ||
def update(i: Int, value: Any): Unit | ||
|
||
// default implementation (slow) | ||
|
@@ -58,7 +64,15 @@ abstract class InternalRow extends SpecializedGetters with Serializable { | |
def copy(): InternalRow | ||
|
||
/** Returns true if there are any NULL values in this row. */ | ||
def anyNull: Boolean | ||
def anyNull: Boolean = { | ||
val len = numFields | ||
var i = 0 | ||
while (i < len) { | ||
if (isNullAt(i)) { return true } | ||
i += 1 | ||
} | ||
false | ||
} | ||
|
||
/* ---------------------- utility methods for Scala ---------------------- */ | ||
|
||
|
@@ -94,4 +108,21 @@ object InternalRow { | |
|
||
/** Returns an empty [[InternalRow]]. */ | ||
val empty = apply() | ||
|
||
/** | ||
* Copies the given value if it's string/struct/array/map type. | ||
*/ | ||
def copyValue(value: Any): Any = { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we have some marker trait for objects that need to be copied? It might make the code a bit more concise, and it will also save some typing. The only downside would be that the trait will probably live in a very weird place because […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are only 4 types, so maybe we don't need to bother? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is fine for now. |
||
if (value.isInstanceOf[UTF8String]) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use pattern matching? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This method may be called many times for nested complex types, so I'm worried about performance here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ??? - pattern matching should yield the same performance as […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, I was told not to use pattern matching in performance-critical paths... Anyway, seems the […] |
||
value.asInstanceOf[UTF8String].copy() | ||
} else if (value.isInstanceOf[InternalRow]) { | ||
value.asInstanceOf[InternalRow].copy() | ||
} else if (value.isInstanceOf[ArrayData]) { | ||
value.asInstanceOf[ArrayData].copy() | ||
} else if (value.isInstanceOf[MapData]) { | ||
value.asInstanceOf[MapData].copy() | ||
} else { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we also support […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The internal values are totally internal; do we really need an escape hatch? |
||
value | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1047,7 +1047,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String | |
final $rowClass $result = new $rowClass(${fieldsCasts.length}); | ||
final InternalRow $tmpRow = $c; | ||
$fieldsEvalCode | ||
$evPrim = $result.copy(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This copy is not needed, because we already copy the values when setting columns in this row. |
||
$evPrim = $result; | ||
""" | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -131,8 +131,6 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] | |
case s: StructType => createCodeForStruct(ctx, input, s) | ||
case ArrayType(elementType, _) => createCodeForArray(ctx, input, elementType) | ||
case MapType(keyType, valueType, _) => createCodeForMap(ctx, input, keyType, valueType) | ||
// UTF8String act as a pointer if it's inside UnsafeRow, so copy it to make it safe. | ||
case StringType => ExprCode("", "false", s"$input.clone()") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This copy is not needed, as we will copy a value before updating it into the row. |
||
case udt: UserDefinedType[_] => convertToSafe(ctx, input, udt.sqlType) | ||
case _ => ExprCode("", "false", input) | ||
} | ||
|
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,4 +172,40 @@ class GeneratedProjectionSuite extends SparkFunSuite { | |
assert(unsafe1 === unsafe3) | ||
assert(unsafe1.getStruct(1, 7) === unsafe3.getStruct(1, 7)) | ||
} | ||
|
||
test("MutableProjection should not cache content from the input row") { | ||
val mutableProj = GenerateMutableProjection.generate( | ||
Seq(BoundReference(0, new StructType().add("i", StringType), true))) | ||
val row = new GenericInternalRow(1) | ||
mutableProj.target(row) | ||
|
||
val unsafeProj = GenerateUnsafeProjection.generate( | ||
Seq(BoundReference(0, new StructType().add("i", StringType), true))) | ||
val unsafeRow = unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("a")))) | ||
|
||
mutableProj.apply(unsafeRow) | ||
assert(row.getStruct(0, 1).getString(0) == "a") | ||
|
||
// Even if the input row of the mutable projection has been changed, the target mutable row | ||
// should stay the same. | ||
unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("b")))) | ||
assert(row.getStruct(0, 1).getString(0).toString == "a") | ||
} | ||
|
||
test("SafeProjection should not cache content from the input row") { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This has always worked, right? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I added this test to make sure my change to the […] |
||
val safeProj = GenerateSafeProjection.generate( | ||
Seq(BoundReference(0, new StructType().add("i", StringType), true))) | ||
|
||
val unsafeProj = GenerateUnsafeProjection.generate( | ||
Seq(BoundReference(0, new StructType().add("i", StringType), true))) | ||
val unsafeRow = unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("a")))) | ||
|
||
val row = safeProj.apply(unsafeRow) | ||
assert(row.getStruct(0, 1).getString(0) == "a") | ||
|
||
// Even if the input row of the safe projection has been changed, the result row | ||
// should stay the same. | ||
unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("b")))) | ||
assert(row.getStruct(0, 1).getString(0).toString == "a") | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is because `clone()` doesn't always make a copy, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps we should just make clone make an actual copy...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`UTF8String` is public to users, so I'm hesitant to change the `clone` method.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, let's leave it then