-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Segregate merged returns by constant value #13792
Conversation
The JIT enforces a limit on the number of epilogs emitted in any given method. Change the logic so that when this merging kicks in, returns of constant values are given merged return blocks distinct from each other and from the general return block, as long as we can do so without going over the limit. This particularly helps avoid redundancy (while still keeping method size down) in methods with a large number of constant returns but only a few distinct constants, which is true of many predicate methods with bool return type. This is the compiler portion of #13466 and dotnet/corefx#23395.
@AndyAyersMS, @briansull PTAL jit-diff results (--frameworks --tests, Windows x64 release)
|
I'll write up some more detailed comments later, but here are some initial thoughts:
|
Wouldn't be better to merge all return blocks and then duplicate blocks as needed somewhere after lowering/lsra where information about the epilog size is available? I've seen cases where the epilog is pretty large and the cost of the |
I should have mentioned that this isn't an attempt at optimal epilog duplication/merging, just an incremental step that improves on the current state and in particular is sufficient to get the motivating cases from bullets numbered "2" in https://github.com/dotnet/coreclr/issues/13466#issuecomment-323831444 and https://github.com/dotnet/corefx/issues/23395#issuecomment-324364657. Diffs tend to look like this: diff --git "a/.\\before\\IsSubsetOf.dasm" "b/.\\after\\IsSubsetOf.dasm"
index 24b3c4c..3f1ec29 100644
--- "a/.\\before\\IsSubsetOf.dasm"
+++ "b/.\\after\\IsSubsetOf.dasm"
@@ -5,96 +5,104 @@
; partially interruptible
; Final local variable assignments
;
-; V00 this [V00,T01] ( 9, 6 ) ref -> rsi this class-hnd
-; V01 arg1 [V01,T02] ( 5, 4 ) ref -> rdi class-hnd
-; V02 loc0 [V02,T03] ( 6, 3 ) ref -> rbp class-hnd
-; V03 loc1 [V03 ] ( 3, 1.50) struct ( 8) [rsp+0x28] do-not-enreg[SF] must-init
-; V04 tmp0 [V04,T04] ( 3, 3 ) int -> rbx
+; V00 this [V00,T00] ( 9, 6 ) ref -> rsi this class-hnd
+; V01 arg1 [V01,T01] ( 5, 4 ) ref -> rdi class-hnd
+; V02 loc0 [V02,T03] ( 6, 3 ) ref -> rbx class-hnd
+; V03 loc1 [V03 ] ( 3, 1.50) struct ( 8) [rsp+0x20] do-not-enreg[SF] must-init
+; V04 tmp0 [V04,T04] ( 3, 3 ) int -> rdi
; V05 tmp1 [V05,T06] ( 3, 0 ) ref -> rdi class-hnd exact
-; V06 tmp2 [V06,T00] ( 6, 7 ) int -> rbx
-; V07 tmp3 [V07 ] ( 2, 1 ) int -> [rsp+0x28] do-not-enreg[] V03.UniqueCount(offs=0x00) P-DEP
-; V08 tmp4 [V08 ] ( 2, 1 ) int -> [rsp+0x2C] do-not-enreg[] V03.UnfoundCount(offs=0x04) P-DEP
+; V06 tmp2 [V06,T02] ( 3, 3 ) int -> rbx
+; V07 tmp3 [V07 ] ( 2, 1 ) int -> [rsp+0x20] do-not-enreg[] V03.UniqueCount(offs=0x00) P-DEP
+; V08 tmp4 [V08 ] ( 2, 1 ) int -> [rsp+0x24] do-not-enreg[] V03.UnfoundCount(offs=0x04) P-DEP
; V09 tmp5 [V09,T07] ( 2, 0 ) ref -> rdx
; V10 tmp6 [V10,T05] ( 2, 2 ) long -> rcx
; V11 OutArgs [V11 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
;
-; Lcl frame size = 56
+; Lcl frame size = 48
G_M12846_IG01:
push rdi
push rsi
- push rbp
push rbx
- sub rsp, 56
+ sub rsp, 48
xor rax, rax
- mov qword ptr [rsp+28H], rax
- mov qword ptr [rsp+30H], rcx
+ mov qword ptr [rsp+20H], rax
+ mov qword ptr [rsp+28H], rcx
mov rsi, rcx
mov rdi, rdx
G_M12846_IG02:
test rdi, rdi
- je G_M12846_IG10
+ je G_M12846_IG13
G_M12846_IG03:
mov rcx, rsi
call [SortedSet`1:get_Count():int:this]
test eax, eax
- jne SHORT G_M12846_IG04
- mov ebx, 1
- jmp G_M12846_IG08
+ je G_M12846_IG07
G_M12846_IG04:
mov rcx, qword ptr [rsi]
call [CORINFO_HELP_READYTORUN_GENERIC_HANDLE]
mov rcx, rax
mov rdx, rdi
call [CORINFO_HELP_ISINSTANCEOFANY]
- mov rbp, rax
- test rbp, rbp
+ mov rbx, rax
+ test rbx, rbx
je SHORT G_M12846_IG06
mov rcx, rsi
- mov rdx, rbp
+ mov rdx, rbx
call [SortedSet`1:HasEqualComparer(ref):bool:this]
test al, al
je SHORT G_M12846_IG06
mov rcx, rsi
call [SortedSet`1:get_Count():int:this]
- mov ebx, eax
- mov rcx, rbp
+ mov edi, eax
+ mov rcx, rbx
call [SortedSet`1:get_Count():int:this]
- cmp eax, ebx
- jge SHORT G_M12846_IG05
- xor ebx, ebx
- jmp SHORT G_M12846_IG08
+ cmp eax, edi
+ jl SHORT G_M12846_IG09
G_M12846_IG05:
mov rcx, rsi
- mov rdx, rbp
+ mov rdx, rbx
call [SortedSet`1:IsSubsetOfSortedSetWithSameComparer(ref):bool:this]
movzx rbx, al
- jmp SHORT G_M12846_IG08
+ jmp SHORT G_M12846_IG11
G_M12846_IG06:
mov rcx, rsi
mov rdx, rdi
xor r8d, r8d
call [SortedSet`1:CheckUniqueAndUnfoundElements(ref,bool):struct:this]
- mov qword ptr [rsp+28H], rax
+ mov qword ptr [rsp+20H], rax
mov rcx, rsi
call [SortedSet`1:get_Count():int:this]
- cmp eax, dword ptr [rsp+28H]
- jne SHORT G_M12846_IG07
- cmp dword ptr [rsp+2CH], 0
+ cmp eax, dword ptr [rsp+20H]
+ jne SHORT G_M12846_IG09
+ cmp dword ptr [rsp+24H], 0
setge bl
movzx rbx, bl
- jmp SHORT G_M12846_IG08
+ jmp SHORT G_M12846_IG11
G_M12846_IG07:
- xor ebx, ebx
+ mov eax, 1
G_M12846_IG08:
- movzx rax, bl
-G_M12846_IG09:
- add rsp, 56
+ add rsp, 48
pop rbx
- pop rbp
pop rsi
pop rdi
ret
+G_M12846_IG09:
+ xor eax, eax
G_M12846_IG10:
+ add rsp, 48
+ pop rbx
+ pop rsi
+ pop rdi
+ ret
+G_M12846_IG11:
+ movzx rax, bl
+G_M12846_IG12:
+ add rsp, 48
+ pop rbx
+ pop rsi
+ pop rdi
+ ret
+G_M12846_IG13:
call [CORINFO_HELP_READYTORUN_NEW]
mov rdi, rax
mov ecx, 0x849
@@ -105,5 +113,5 @@ G_M12846_IG10:
mov rcx, rdi
call CORINFO_HELP_THROW
int3
-; Total bytes of code 250, prolog size 20 for method SortedSet`1:IsSubsetOf(ref):bool:this
+; Total bytes of code 259, prolog size 19 for method SortedSet`1:IsSubsetOf(ref):bool:this We're not too graceful with the handling of the temp generated for the common merge case, especially for bool methods, so as you can see the dynamic path here replaces In light of that, regarding some of the specific points:
I agree, and likewise that it's odd to have the arbitrary limit of 4 applied to all targets; my thinking was just to untangle some of the cases that are currently hurting us from merging, and to leave this existing oddity alone (for now).
My thinking was that the current metric is effectively "5 is too many, fewer is ok", and that I'm effectively preserving that. Having a smarter metric (e.g. that includes epilog size as @mikedn suggests) certainly sounds desirable, but I'm not sure what data points are available to inform it without making this a significantly larger change...
Both of these are getting at TP/CQ tradeoff, and yes what I've done is biased toward TP in that sense. Should be straightforward enough to compute that histogram you suggest to get a better sense... |
Note: if you are tempted to go to a two-pass approach to try and prioritize merge candidates when there are lots of returns -- we've already counted up returns during the IL prescan (or at least done the equivalent), so you could presumably avoid being clever about merging if there are 4 or fewer return points. That probably amortizes the TP cost for the cases where there is no benefit sufficiently that you can spend more TP on getting the best results for cases where there is a large benefit. |
src/jit/flowgraph.cpp
Outdated
// return blocks in `returnBlocks`. | ||
BasicBlock* returnBlocks[returnCountHardLimit]; | ||
|
||
// Each constant value returned gets its own merged return block that |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this runs after the ValueNumbering phase I would suggest using the ValueNumber as the key instead of a constant.
That way a Select function could return argA or argB in several places in the method and those returns could be combined as with constants.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This runs in fgAddInternal
, well before value numbering.
Histograms as promised:
Data points are the number of distinct return constants, limited to the set of methods where discretionary return merging kicked in (more than four return sites, and none of the sync flag etc. that force us to merge down to a single return) and at least one return site returned a constant. |
Since that showed "5 constants" and "6 constants" as potentially interesting, I decided to see what the distributions looked like in that subset, to get an idea of how skewed they are to one or another of the returned values (the idea being that if it's flat, we're faced with an arbitrary choice anyway, assuming we're keeping our limit of 4 epilogs for the time being). Here's what that looked like:
|
This is from a "universe" of 339K or so methods, right? So roughly 0.2% of all methods in this set get into the merging logic, which seems plausible, and only some subset of those could potentially be further improved. I suppose we could spin this different ways:
|
I've filed #13814 to track the opportunity to implement a more robust epilog generation strategy; @AndyAyersMS, @mikedn, @briansull, I think it captures the points you've all made about that, please update if not. I still think there's merit in this change, since it is a fairly small modification of the current algorithm with minimal compile-time overhead, caps codesize growth at 4 epilogs in the small percentage of functions with constant returns, and (most importantly) improves codegen enough that people can stop writing
This is part of
My inclination is the former -- the goal here, quoting https://github.com/dotnet/coreclr/issues/13466#issuecomment-323831444, is "a very narrow pattern match [that] could cheaply identify and help with search-loop predicate methods, which the motivating cases here have been", and I think it makes more sense to tackle #13814 than to embellish this much beyond what's here. |
I'm also in the former camp:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Logic looks good, just some notes on comments.
src/jit/flowgraph.cpp
Outdated
// searching up to `searchLimit` for already-created constant return blocks if it returns a | ||
// constant. If a constant return is used, rewrite `returnBlock` to jump to it, else set | ||
// `genReturnBB`/`genReturnLocal` so the rewrite can happen at Morph. | ||
BasicBlock* Merge(BasicBlock* returnBlock, unsigned searchLimit) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe add a comment here about what it means when returnBlock
is nullptr
.
Also should this method (and others in the class) have the "standard" jit method header comments?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure; done.
src/jit/flowgraph.cpp
Outdated
// and calls that return to hidden buffers. | ||
// We do go ahead and decrement fgReturnCount to reflect that this block will | ||
// eventually be rewritten. | ||
comp->fgReturnCount--; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm probably missing something here, but the comment seems stale.
If we found a constant return for this return point then we did rewrite the return. Maybe the first part should say "we didn't rewrite the return for cases where the return will use the common return block"?
And the second part, seems like merge always "succeeds" unless it is in the eager create mode and so decrementing is something that is always done, whether or not rewriting happened.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mentioned "for the non-const cases" in the first clause, but yeah the way the method flows now there's not a convenient place for a codepath representing that distinction, so I've moved the comments about this up to the method headers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe it's a parsing ambiguity -- is a "non-const" case describing the input or the output? I've been thinking it applied to the input....
Now that the `GT_RETURN` is created eagerly, these need to use `fgInsertStmtNearEnd` rather than `fgInsertStmtAtEnd`.
This is more clearly expressed in the method header comments.
Updated to incorporate @AndyAyersMS's feedback and to fix the issue that popped up in CI where I'm not creating the return statement for |
Thanks for the updates. |
It's a bit too big to justify splitting up `fgAddInteral`.
Pushed one more update: pulled the new class definition out of the enclosing method, to help the method's readability. |
Remove some `goto`s that were added to work around undesirable jit layout (#9692, fixed in dotnet#13314) and epilog factoring (improved in dotnet#13792 and dotnet#13903), which are no longer needed. Resolves #13466.
The JIT enforces a limit on the number of epilogs emitted in any given
method. Change the logic so that when this merging kicks in, returns of
constant values are given merged return blocks distinct from each other
and from the general return block, as long as we can do so without going
over the limit. This particularly helps avoid redundancy (while still
keeping method size down) in methods with a large number of constant
returns but only a few distinct constants, which is true of many
predicate methods with bool return type.
This is the compiler portion of #13466 and dotnet/corefx#23395.