-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-39057][SQL] Offset could work without Limit #36417
Changes from 1 commit
2db1f64
caaac4e
ddbfc0d
d9c0157
48673ea
fff48d0
e0873cd
4aaa114
1d22141
6c282c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,7 +96,6 @@ abstract class Optimizer(catalogManager: CatalogManager) | |
CollapseWindow, | ||
CombineFilters, | ||
EliminateLimits, | ||
RewriteOffsets, | ||
CombineUnions, | ||
// Constant folding and strength reduction | ||
OptimizeRepartition, | ||
|
@@ -673,7 +672,7 @@ object RemoveNoopUnion extends Rule[LogicalPlan] { | |
} | ||
|
||
/** | ||
* Pushes down [[LocalLimit]] beneath UNION ALL and joins. | ||
* Pushes down [[LocalLimit]] beneath UNION ALL, OFFSET and joins. | ||
*/ | ||
object LimitPushDown extends Rule[LogicalPlan] { | ||
|
||
|
@@ -750,6 +749,11 @@ object LimitPushDown extends Rule[LogicalPlan] { | |
Limit(le, Project(a.aggregateExpressions, LocalLimit(le, a.child))) | ||
case Limit(le @ IntegerLiteral(1), p @ Project(_, a: Aggregate)) if a.groupOnly => | ||
Limit(le, p.copy(child = Project(a.aggregateExpressions, LocalLimit(le, a.child)))) | ||
// Merge offset value and limit value into LocalLimit and pushes down LocalLimit through Offset. | ||
case LocalLimit(le, Offset(oe @ IntegerLiteral(offset), grandChild)) if offset > 0 => | ||
Offset(oe, LocalLimit(Add(le, oe), grandChild)) | ||
// Eliminate Offset if offset value equals 0. | ||
case Offset(IntegerLiteral(0), child) => child | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can add a new rule |
||
} | ||
} | ||
|
||
|
@@ -1870,23 +1874,6 @@ object EliminateLimits extends Rule[LogicalPlan] { | |
} | ||
} | ||
|
||
/** | ||
* Rewrite [[Offset]] by eliminate [[Offset]] or merge offset value and limit value into | ||
* [[LocalLimit]]. See [[Limit]] for more information about the difference between | ||
* [[LocalLimit]] and [[GlobalLimit]]. | ||
*/ | ||
object RewriteOffsets extends Rule[LogicalPlan] { | ||
def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
case localLimit @ LocalLimit(le, Offset(oe, grandChild)) => | ||
val offset = oe.eval().asInstanceOf[Int] | ||
if (offset == 0) { | ||
localLimit.withNewChildren(Seq(grandChild)) | ||
} else { | ||
Offset(oe, LocalLimit(Add(le, oe), grandChild)) | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Check if there any cartesian products between joins of any type in the optimized plan tree. | ||
* Throw an error if a cartesian product is found without an explicit cross join specified. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -78,10 +78,14 @@ case class CollectLimitExec(limit: Int = -1, child: SparkPlan, offset: Int = 0) | |
val singlePartitionRDD = if (childRDD.getNumPartitions == 1) { | ||
childRDD | ||
} else { | ||
val locallyLimited = if (offset > 0) { | ||
childRDD.mapPartitionsInternal(_.take(limit + offset)) | ||
val locallyLimited = if (limit >= 0) { | ||
if (offset > 0) { | ||
childRDD.mapPartitionsInternal(_.take(limit + offset)) | ||
} else { | ||
childRDD.mapPartitionsInternal(_.take(limit)) | ||
} | ||
} else { | ||
childRDD.mapPartitionsInternal(_.take(limit)) | ||
childRDD | ||
} | ||
new ShuffledRowRDD( | ||
ShuffleExchangeExec.prepareShuffleDependency( | ||
|
@@ -92,10 +96,14 @@ case class CollectLimitExec(limit: Int = -1, child: SparkPlan, offset: Int = 0) | |
writeMetrics), | ||
readMetrics) | ||
} | ||
if (offset > 0) { | ||
singlePartitionRDD.mapPartitionsInternal(_.drop(offset).take(limit)) | ||
if (limit >= 0) { | ||
if (offset > 0) { | ||
singlePartitionRDD.mapPartitionsInternal(_.drop(offset).take(limit)) | ||
} else { | ||
singlePartitionRDD.mapPartitionsInternal(_.take(limit)) | ||
} | ||
} else { | ||
singlePartitionRDD.mapPartitionsInternal(_.take(limit)) | ||
singlePartitionRDD.mapPartitionsInternal(_.drop(offset)) | ||
} | ||
} | ||
} | ||
|
@@ -223,53 +231,38 @@ case class GlobalLimitExec(limit: Int, child: SparkPlan) extends BaseLimitExec { | |
*/ | ||
case class GlobalLimitAndOffsetExec( | ||
limit: Int = -1, | ||
offset: Int = 0, | ||
offset: Int, | ||
child: SparkPlan) extends BaseLimitExec { | ||
assert(limit >= 0 || (limit == -1 && offset > 0)) | ||
assert(offset > 0) | ||
|
||
override def requiredChildDistribution: List[Distribution] = AllTuples :: Nil | ||
|
||
override def doExecute(): RDD[InternalRow] = if (limit >= 0) { | ||
if (offset > 0) { | ||
child.execute().mapPartitions(iter => iter.take(limit + offset).drop(offset)) | ||
} else { | ||
child.execute().mapPartitions(iter => iter.take(limit)) | ||
} | ||
child.execute().mapPartitions(iter => iter.take(limit + offset).drop(offset)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not related to this PR, but since we are touching here, let's use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK |
||
} else { | ||
child.execute().mapPartitions(iter => iter.drop(offset)) | ||
} | ||
|
||
private lazy val skipTerm = BaseLimitExec.newLimitCountTerm() | ||
|
||
override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { | ||
ctx.addMutableState( | ||
CodeGenerator.JAVA_INT, skipTerm, forceInline = true, useFreshName = false) | ||
if (limit >= 0) { | ||
// The counter name is already obtained by the upstream operators via `limitNotReachedChecks`. | ||
// Here we have to inline it to not change its name. This is fine as we won't have many limit | ||
// operators in one query. | ||
ctx.addMutableState( | ||
CodeGenerator.JAVA_INT, countTerm, forceInline = true, useFreshName = false) | ||
if (offset > 0) { | ||
ctx.addMutableState( | ||
CodeGenerator.JAVA_INT, skipTerm, forceInline = true, useFreshName = false) | ||
s""" | ||
| if ($skipTerm < $offset) { | ||
| $skipTerm += 1; | ||
| } else if ($countTerm < $limit) { | ||
| $countTerm += 1; | ||
| ${consume(ctx, input)} | ||
| } | ||
""".stripMargin | ||
} else { | ||
s""" | ||
| if ($countTerm < $limit) { | ||
| $countTerm += 1; | ||
| ${consume(ctx, input)} | ||
| } | ||
s""" | ||
| if ($skipTerm < $offset) { | ||
| $skipTerm += 1; | ||
| } else if ($countTerm < $limit) { | ||
| $countTerm += 1; | ||
| ${consume(ctx, input)} | ||
| } | ||
""".stripMargin | ||
} | ||
} else { | ||
ctx.addMutableState( | ||
CodeGenerator.JAVA_INT, skipTerm, forceInline = true, useFreshName = false) | ||
s""" | ||
| if ($skipTerm < $offset) { | ||
| $skipTerm += 1; | ||
|
@@ -295,7 +288,8 @@ case class TakeOrderedAndProjectExec( | |
limit: Int, | ||
sortOrder: Seq[SortOrder], | ||
projectList: Seq[NamedExpression], | ||
child: SparkPlan, offset: Int = 0) extends UnaryExecNode { | ||
child: SparkPlan, | ||
offset: Int = 0) extends UnaryExecNode { | ||
|
||
override def output: Seq[Attribute] = { | ||
projectList.map(_.toAttribute) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the pushdown is correct even if
offset == 0
. We don't needif offset > 0