From 388edb64f004e1c24b350edca43cf9fd2fdb7a4e Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Tue, 11 Apr 2023 11:08:28 +0200 Subject: [PATCH] JIT: Add a (disabled) prototype for a generalized promotion pass (#83388) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a "physical" promotion pass that generalizes the existing promotion. More specifically, it does not have restrictions on field count and it can handle arbitrary recursive promotion. The pass is physical in the sense that it does not rely on any field metadata for structs. Instead, it works in two separate passes over the IR: 1. In the first pass we find and analyze how unpromoted struct locals are accessed. For example, for a simple program like: ``` public static void Main() { S s = default; Call(s, s.C); Console.WriteLine(s.B + s.C); } [MethodImpl(MethodImplOptions.NoInlining)] private static void Call(S s, byte b) { } private struct S { public byte A, B, C, D, E; } ``` we see IR like: ``` ***** BB01 STMT00000 ( 0x000[E-] ... 0x003 ) [000003] IA--------- ▌ ASG struct (init) [000001] D------N--- ├──▌ LCL_VAR struct V00 loc0 [000002] ----------- └──▌ CNS_INT int 0 ***** BB01 STMT00001 ( 0x008[E-] ... 0x026 ) [000008] --C-G------ ▌ CALL void Program:Call(Program+S,ubyte) [000004] ----------- arg0 ├──▌ LCL_VAR struct V00 loc0 [000007] ----------- arg1 └──▌ LCL_FLD ubyte V00 loc0 [+2] ***** BB01 STMT00002 ( 0x014[E-] ... ??? ) [000016] --C-G------ ▌ CALL void System.Console:WriteLine(int) [000015] ----------- arg0 └──▌ ADD int [000011] ----------- ├──▌ LCL_FLD ubyte V00 loc0 [+1] [000014] ----------- └──▌ LCL_FLD ubyte V00 loc0 [+2] ``` and the analysis produces ``` Accesses for V00 [000..005) #: (2, 200) # assigned from: (0, 0) # assigned to: (1, 100) # as call arg: (1, 100) # as implicit by-ref call arg: (1, 100) # as on-stack call arg: (0, 0) # as retbuf: (0, 0) # as returned value: (0, 0) ubyte @ 001 #: (1, 100) # assigned from: (0, 0) # assigned to: (0, 0) # as call arg: (0, 0) # as implicit by-ref call arg: (0, 0) # as on-stack call arg: (0, 0) # as retbuf: (0, 0) # as returned value: (0, 0) ubyte @ 002 #: (2, 200) # assigned from: (0, 0) # assigned to: (0, 0) # as call arg: (1, 100) # as implicit by-ref call arg: (0, 0) # as on-stack call arg: (0, 0) # as retbuf: (0, 0) # as returned value: (0, 0) ``` Here each pair is (ref count, weighted ref count). Based on this accounting, the analysis estimates the profitability of replacing some of the accessed parts of the struct with a local. Such a replacement may be costly because overlapping struct accesses (e.g. passing the whole struct as an argument) may require more expensive codegen after promotion. And of course, creating new locals introduces more register pressure. Currently the profitability analysis is very crude. In this case the logic decides that promotion is not worth it: ``` Evaluating access ubyte @ 001 Single write-back cost: 5 Write backs: 100 Read backs: 100 Cost with: 1350 Cost without: 650 Disqualifying replacement Evaluating access ubyte @ 002 Single write-back cost: 5 Write backs: 100 Read backs: 100 Cost with: 1700 Cost without: 1300 Disqualifying replacement ``` 2. In the second pass the field accesses are replaced with new locals for the profitable cases. For overlapping accesses, this currently involves writing the replacements back to the struct local first. For arguments/OSR locals, it involves reading them back from the struct first.
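As a rough source-level sketch: if the two byte fields were promoted here (as they are under the stress mode shown next), the body of Main above would conceptually become something like the following. The local names are hypothetical and purely illustrative; the pass of course operates on the IR, not on C# source.

```
S s = default;               // the struct itself is still initialized as a whole
byte sC = s.C;               // read s.C back from the struct local into its replacement
Call(s, sC);                 // the whole-struct argument keeps using the struct local
byte sB = s.B;               // read s.B back into its replacement
Console.WriteLine(sB + sC);  // field uses now come from the replacement locals
```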
In the above case we can override the profitability analysis with stress mode STRESS_PHYSICAL_PROMOTION_COST and we get: ``` Evaluating access ubyte @ 001 Single write-back cost: 5 Write backs: 100 Read backs: 100 Cost with: 1350 Cost without: 650 Promoting replacement due to stress lvaGrabTemp returning 2 (V02 tmp1) (a long lifetime temp) called for V00.[001..002). Evaluating access ubyte @ 002 Single write-back cost: 5 Write backs: 100 Read backs: 100 Cost with: 1700 Cost without: 1300 Promoting replacement due to stress lvaGrabTemp returning 3 (V03 tmp2) (a long lifetime temp) called for V00.[002..003). V00 promoted with 2 replacements [001..002) promoted as ubyte V02 [002..003) promoted as ubyte V03 ... ***** BB01 STMT00000 ( 0x000[E-] ... 0x003 ) [000003] IA--------- ▌ ASG struct (init) [000001] D------N--- ├──▌ LCL_VAR struct V00 loc0 [000002] ----------- └──▌ CNS_INT int 0 ***** BB01 STMT00001 ( 0x008[E-] ... 0x026 ) [000008] -ACXG------ ▌ CALL void Program:Call(Program+S,ubyte) [000004] ----------- arg0 ├──▌ LCL_VAR struct V00 loc0 [000022] -A--------- arg1 └──▌ COMMA ubyte [000021] -A--------- ├──▌ ASG ubyte [000019] D------N--- │ ├──▌ LCL_VAR ubyte V03 tmp2 [000020] ----------- │ └──▌ LCL_FLD ubyte V00 loc0 [+2] [000018] ----------- └──▌ LCL_VAR ubyte V03 tmp2 ***** BB01 STMT00002 ( 0x014[E-] ... ??? ) [000016] -ACXG------ ▌ CALL void System.Console:WriteLine(int) [000015] -A--------- arg0 └──▌ ADD int [000027] -A--------- ├──▌ COMMA ubyte [000026] -A--------- │ ├──▌ ASG ubyte [000024] D------N--- │ │ ├──▌ LCL_VAR ubyte V02 tmp1 [000025] ----------- │ │ └──▌ LCL_FLD ubyte V00 loc0 [+1] [000023] ----------- │ └──▌ LCL_VAR ubyte V02 tmp1 [000028] ----------- └──▌ LCL_VAR ubyte V03 tmp2 ``` The pass is still rudimentary and is missing many basic CQ optimizations. For example, it does not make use of any liveness yet and it does not have any decomposition support for assignments. Yet, it already shows good potential in user benchmarks. I have listed some follow-up improvements in #76928. This PR adds the pass, but it is disabled by default. It can be enabled by setting DOTNET_JitStressModeNames=STRESS_PHYSICAL_PROMOTION. Two new scenarios that enable it have been added to jit-experimental, to be used for testing purposes.
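For reference, the "Cost with"/"Cost without" figures in the dumps above follow directly from the crude size-based constants currently used by the profitability estimate (EvaluateReplacement in promotion.cpp below). For example, for the ubyte @ 002 access:

```
cost without promotion = 6.5 * weighted accesses
                       = 6.5 * 200 = 1300

cost with promotion    = 3.5 * weighted accesses + 5 * weighted read backs + 5 * weighted write backs
                       = 3.5 * 200 + 5 * 100 + 5 * 100 = 1700
```

(The per-write-back cost is 15 instead of 5 when the write back is a TYP_REF store into an implicit byref local, since that requires a checked write barrier.)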
--- .../templates/runtimes/run-test-job.yml | 2 + src/coreclr/jit/CMakeLists.txt | 2 + src/coreclr/jit/assertionprop.cpp | 16 + src/coreclr/jit/compiler.cpp | 4 + src/coreclr/jit/compiler.h | 14 +- src/coreclr/jit/compmemkind.h | 1 + src/coreclr/jit/compphases.h | 1 + src/coreclr/jit/fgopt.cpp | 67 +- src/coreclr/jit/fgstmt.cpp | 6 +- src/coreclr/jit/gentree.cpp | 37 +- src/coreclr/jit/jitconfigvalues.h | 1 + src/coreclr/jit/jitstd/vector.h | 2 +- src/coreclr/jit/morph.cpp | 12 +- src/coreclr/jit/promotion.cpp | 1282 +++++++++++++++++ src/coreclr/jit/promotion.h | 33 + src/tests/Common/testenvironment.proj | 3 + 16 files changed, 1442 insertions(+), 41 deletions(-) create mode 100644 src/coreclr/jit/promotion.cpp create mode 100644 src/coreclr/jit/promotion.h diff --git a/eng/pipelines/common/templates/runtimes/run-test-job.yml b/eng/pipelines/common/templates/runtimes/run-test-job.yml index 63366eebd928c2..d6c49f05be77b9 100644 --- a/eng/pipelines/common/templates/runtimes/run-test-job.yml +++ b/eng/pipelines/common/templates/runtimes/run-test-job.yml @@ -583,6 +583,8 @@ jobs: - jitpartialcompilation - jitpartialcompilation_pgo - jitobjectstackallocation + - jitgeneralizedpromotion + - jitgeneralizedpromotion_full ${{ if in(parameters.testGroup, 'jit-cfg') }}: scenarios: diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index d5ba36b8eec52f..1f27136af67305 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -158,6 +158,7 @@ set( JIT_SOURCES optimizer.cpp patchpoint.cpp phase.cpp + promotion.cpp rangecheck.cpp rationalize.cpp redundantbranchopts.cpp @@ -348,6 +349,7 @@ set( JIT_HEADERS objectalloc.h opcode.h phase.h + promotion.h rangecheck.h rationalize.h regalloc.h diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 2f63f7ceba7d2d..7cdc61483e2cd7 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -1256,6 +1256,22 @@ AssertionIndex Compiler::optCreateAssertion(GenTree* op1, goto DONE_ASSERTION; // Don't make an assertion } + // We process locals when we see the LCL_VAR node instead + // of at its actual use point (its parent). That opens us + // up to problems in a case like the following, assuming we + // allowed creating an assertion like V10 = V35: + // + // └──▌ ADD int + // ├──▌ LCL_VAR int V10 tmp6 -> copy propagated to [V35 tmp31] + // └──▌ COMMA int + // ├──▌ ASG int + // │ ├──▌ LCL_VAR int V35 tmp31 + // │ └──▌ LCL_FLD int V03 loc1 [+4] + if (lclVar2->lvRedefinedInEmbeddedStatement) + { + goto DONE_ASSERTION; // Don't make an assertion + } + assertion.op2.kind = O2K_LCLVAR_COPY; assertion.op2.vn = optConservativeNormalVN(op2); assertion.op2.lcl.lclNum = lclNum2; diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 92bdfa5ea4340b..2c3c0dbdec25dc 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4745,6 +4745,10 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // DoPhase(this, PHASE_EARLY_LIVENESS, &Compiler::fgEarlyLiveness); + // Promote struct locals based on primitive access patterns + // + DoPhase(this, PHASE_PHYSICAL_PROMOTION, &Compiler::PhysicalPromotion); + // Run a simple forward substitution pass. 
// DoPhase(this, PHASE_FWD_SUB, &Compiler::fgForwardSub); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 0c2797bae35382..5674e2aa25d85c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -661,6 +661,8 @@ class LclVarDsc unsigned char lvIsOSRExposedLocal : 1; // OSR local that was address exposed in Tier0 + unsigned char lvRedefinedInEmbeddedStatement : 1; // Local has redefinitions inside embedded statements that + // disqualify it from local copy prop. private: unsigned char lvIsNeverNegative : 1; // The local is known to be never negative @@ -2030,6 +2032,9 @@ class Compiler friend class CallArgs; friend class IndirectCallTransformer; friend class ProfileSynthesis; + friend class LocalsUseVisitor; + friend class Promotion; + friend class ReplaceVisitor; #ifdef FEATURE_HW_INTRINSICS friend struct HWIntrinsicInfo; @@ -2449,7 +2454,7 @@ class Compiler GenTree* gtNewOperNode(genTreeOps oper, var_types type, GenTree* op1); // For binary opers. - GenTree* gtNewOperNode(genTreeOps oper, var_types type, GenTree* op1, GenTree* op2); + GenTreeOp* gtNewOperNode(genTreeOps oper, var_types type, GenTree* op1, GenTree* op2); GenTreeCC* gtNewCC(genTreeOps oper, var_types type, GenCondition cond); GenTreeOpCC* gtNewOperCC(genTreeOps oper, var_types type, GenCondition cond, GenTree* op1, GenTree* op2); @@ -5740,9 +5745,9 @@ class Compiler private: void fgInsertStmtNearEnd(BasicBlock* block, Statement* stmt); void fgInsertStmtAtBeg(BasicBlock* block, Statement* stmt); - void fgInsertStmtAfter(BasicBlock* block, Statement* insertionPoint, Statement* stmt); public: + void fgInsertStmtAfter(BasicBlock* block, Statement* insertionPoint, Statement* stmt); void fgInsertStmtBefore(BasicBlock* block, Statement* insertionPoint, Statement* stmt); private: @@ -6078,6 +6083,8 @@ class Compiler PhaseStatus fgMarkAddressExposedLocals(); void fgSequenceLocals(Statement* stmt); + PhaseStatus PhysicalPromotion(); + PhaseStatus fgForwardSub(); bool fgForwardSubBlock(BasicBlock* block); bool fgForwardSubStatement(Statement* statement); @@ -9720,6 +9727,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX STRESS_MODE(SSA_INFO) /* Select lower thresholds for "complex" SSA num encoding */ \ STRESS_MODE(SPLIT_TREES_RANDOMLY) /* Split all statements at a random tree */ \ STRESS_MODE(SPLIT_TREES_REMOVE_COMMAS) /* Remove all GT_COMMA nodes */ \ + STRESS_MODE(NO_OLD_PROMOTION) /* Do not use old promotion */ \ + STRESS_MODE(PHYSICAL_PROMOTION) /* Use physical promotion */ \ + STRESS_MODE(PHYSICAL_PROMOTION_COST) \ \ /* After COUNT_VARN, stress level 2 does all of these all the time */ \ \ diff --git a/src/coreclr/jit/compmemkind.h b/src/coreclr/jit/compmemkind.h index 03a8f56d28bc93..645a6b44f80ee2 100644 --- a/src/coreclr/jit/compmemkind.h +++ b/src/coreclr/jit/compmemkind.h @@ -50,6 +50,7 @@ CompMemKindMacro(LoopHoist) CompMemKindMacro(Unknown) CompMemKindMacro(RangeCheck) CompMemKindMacro(CopyProp) +CompMemKindMacro(Promotion) CompMemKindMacro(SideEffects) CompMemKindMacro(ObjectAllocator) CompMemKindMacro(VariableLiveRanges) diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h index d8237c91270bcb..380279139e7435 100644 --- a/src/coreclr/jit/compphases.h +++ b/src/coreclr/jit/compphases.h @@ -43,6 +43,7 @@ CompPhaseNameMacro(PHASE_UPDATE_FINALLY_FLAGS, "Update finally target flag CompPhaseNameMacro(PHASE_EARLY_UPDATE_FLOW_GRAPH, "Update flow graph early pass", false, -1, false) CompPhaseNameMacro(PHASE_STR_ADRLCL, "Morph - 
Structs/AddrExp", false, -1, false) CompPhaseNameMacro(PHASE_EARLY_LIVENESS, "Early liveness", false, -1, false) +CompPhaseNameMacro(PHASE_PHYSICAL_PROMOTION, "Physical promotion", false, -1, false) CompPhaseNameMacro(PHASE_FWD_SUB, "Forward Substitution", false, -1, false) CompPhaseNameMacro(PHASE_MORPH_IMPBYREF, "Morph - ByRefs", false, -1, false) CompPhaseNameMacro(PHASE_PROMOTE_STRUCTS, "Morph - Promote Structs", false, -1, false) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index aae721e7135a15..eb0605667a0ee8 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -6774,45 +6774,52 @@ PhaseStatus Compiler::fgTailMerge() // for (BasicBlock* const predBlock : block->PredBlocks()) { - if ((predBlock->GetUniqueSucc() == block) && BasicBlock::sameEHRegion(block, predBlock)) + if (predBlock->GetUniqueSucc() != block) { - Statement* lastStmt = predBlock->lastStmt(); + continue; + } - // Block might be empty. - // - if (lastStmt == nullptr) - { - continue; - } + if (!BasicBlock::sameEHRegion(block, predBlock)) + { + continue; + } - // Walk back past any GT_NOPs. - // - Statement* const firstStmt = predBlock->firstStmt(); - while (lastStmt->GetRootNode()->OperIs(GT_NOP)) - { - if (lastStmt == firstStmt) - { - // predBlock is evidently all GT_NOP. - // - lastStmt = nullptr; - break; - } + Statement* lastStmt = predBlock->lastStmt(); - lastStmt = lastStmt->GetPrevStmt(); - } + // Block might be empty. + // + if (lastStmt == nullptr) + { + continue; + } - // Block might be effectively empty. - // - if (lastStmt == nullptr) + // Walk back past any GT_NOPs. + // + Statement* const firstStmt = predBlock->firstStmt(); + while (lastStmt->GetRootNode()->OperIs(GT_NOP)) + { + if (lastStmt == firstStmt) { - continue; + // predBlock is evidently all GT_NOP. + // + lastStmt = nullptr; + break; } - // We don't expect to see PHIs but watch for them anyways. - // - assert(!lastStmt->IsPhiDefnStmt()); - predInfo.Emplace(predBlock, lastStmt); + lastStmt = lastStmt->GetPrevStmt(); + } + + // Block might be effectively empty. + // + if (lastStmt == nullptr) + { + continue; } + + // We don't expect to see PHIs but watch for them anyways. + // + assert(!lastStmt->IsPhiDefnStmt()); + predInfo.Emplace(predBlock, lastStmt); } // Are there enough preds to make it interesting? 
diff --git a/src/coreclr/jit/fgstmt.cpp b/src/coreclr/jit/fgstmt.cpp index 2269e19896308b..4b8fff128399f8 100644 --- a/src/coreclr/jit/fgstmt.cpp +++ b/src/coreclr/jit/fgstmt.cpp @@ -386,11 +386,15 @@ Statement* Compiler::fgNewStmtFromTree(GenTree* tree, BasicBlock* block, const D { Statement* stmt = gtNewStmt(tree, di); - if (fgNodeThreading != NodeThreading::None) + if (fgNodeThreading == NodeThreading::AllTrees) { gtSetStmtInfo(stmt); fgSetStmtSeq(stmt); } + else if (fgNodeThreading == NodeThreading::AllLocals) + { + fgSequenceLocals(stmt); + } #if DEBUG if (block != nullptr) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9d8895b19b5412..b90dc75dede0a5 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -7012,7 +7012,7 @@ void GenTree::SetVtableForOper(genTreeOps oper) } #endif // DEBUGGABLE_GENTREE -GenTree* Compiler::gtNewOperNode(genTreeOps oper, var_types type, GenTree* op1, GenTree* op2) +GenTreeOp* Compiler::gtNewOperNode(genTreeOps oper, var_types type, GenTree* op1, GenTree* op2) { assert(op1 != nullptr); assert(op2 != nullptr); @@ -7021,7 +7021,7 @@ GenTree* Compiler::gtNewOperNode(genTreeOps oper, var_types type, GenTree* op1, // should call the appropriate constructor for the extended type. assert(!GenTree::IsExOp(GenTree::OperKind(oper))); - GenTree* node = new (this, oper) GenTreeOp(oper, type, op1, op2); + GenTreeOp* node = new (this, oper) GenTreeOp(oper, type, op1, op2); return node; } @@ -8397,7 +8397,7 @@ GenTree* Compiler::gtClone(GenTree* tree, bool complexOK) return nullptr; } - if (tree->gtOper == GT_FIELD) + if (tree->OperIs(GT_FIELD)) { GenTree* objp = nullptr; @@ -16263,6 +16263,34 @@ bool Compiler::gtSplitTree( return false; } + bool IsValue(const UseInfo& useInf) + { + GenTree* node = (*useInf.Use)->gtEffectiveVal(); + if (!node->IsValue()) + { + return false; + } + + if (node->OperIs(GT_ASG)) + { + return false; + } + + GenTree* user = useInf.User; + + if (user == nullptr) + { + return false; + } + + if (user->OperIs(GT_COMMA) && (&user->AsOp()->gtOp1 == useInf.Use)) + { + return false; + } + + return true; + } + void SplitOutUse(const UseInfo& useInf, bool userIsReturned) { GenTree** use = useInf.Use; @@ -16328,8 +16356,7 @@ bool Compiler::gtSplitTree( } Statement* stmt = nullptr; - if (!(*use)->IsValue() || (*use)->gtEffectiveVal()->OperIs(GT_ASG) || (user == nullptr) || - (user->OperIs(GT_COMMA) && (user->gtGetOp1() == *use))) + if (!IsValue(useInf)) { GenTree* sideEffects = nullptr; m_compiler->gtExtractSideEffList(*use, &sideEffects); diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 0737f065859b11..9400b3017cd5e4 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -431,6 +431,7 @@ CONFIG_STRING(JitEnableVNBasedDeadStoreRemovalRange, W("JitEnableVNBasedDeadStor CONFIG_STRING(JitEnableEarlyLivenessRange, W("JitEnableEarlyLivenessRange")) CONFIG_STRING(JitOnlyOptimizeRange, W("JitOnlyOptimizeRange")) // If set, all methods that do _not_ match are forced into MinOpts +CONFIG_STRING(JitEnablePhysicalPromotionRange, W("JitEnablePhysicalPromotionRange")) CONFIG_INTEGER(JitDoSsa, W("JitDoSsa"), 1) // Perform Static Single Assignment (SSA) numbering on the variables CONFIG_INTEGER(JitDoValueNumber, W("JitDoValueNumber"), 1) // Perform value numbering on method expressions diff --git a/src/coreclr/jit/jitstd/vector.h b/src/coreclr/jit/jitstd/vector.h index 6a547be93c0b89..268ce3a0c43e85 100644 --- a/src/coreclr/jit/jitstd/vector.h 
+++ b/src/coreclr/jit/jitstd/vector.h @@ -733,7 +733,7 @@ void vector::insert_elements_helper(iterator iter, size_type size, ensure_capacity(m_nSize + size); - for (int src = m_nSize - 1, dst = m_nSize + size - 1; src >= (int) pos; --src, --dst) + for (int src = (int)(m_nSize - 1), dst = (int)(m_nSize + size - 1); src >= (int) pos; --src, --dst) { m_pArray[dst] = m_pArray[src]; } diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index ac2826ccec4c5b..41f69fe191d0ff 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -3328,7 +3328,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call) if (argLclNum != BAD_VAR_NUM) { - argObj->ChangeType(argVarDsc->TypeGet()); + argx->ChangeType(argVarDsc->TypeGet()); argObj->SetOper(GT_LCL_VAR); argObj->AsLclVar()->SetLclNum(argLclNum); } @@ -3346,7 +3346,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call) { // TODO-CQ: perform this transformation in lowering instead of here and // avoid marking enregisterable structs DNER. - argObj->ChangeType(structBaseType); + argx->ChangeType(structBaseType); if (argObj->OperIs(GT_LCL_VAR)) { argObj->SetOper(GT_LCL_FLD); @@ -14751,6 +14751,14 @@ PhaseStatus Compiler::fgPromoteStructs() return PhaseStatus::MODIFIED_NOTHING; } +#ifdef DEBUG + if (compStressCompile(STRESS_NO_OLD_PROMOTION, 10)) + { + JITDUMP(" skipping due to stress\n"); + return PhaseStatus::MODIFIED_NOTHING; + } +#endif + #if 0 // The code in this #if has been useful in debugging struct promotion issues, by // enabling selective enablement of the struct promotion optimization according to diff --git a/src/coreclr/jit/promotion.cpp b/src/coreclr/jit/promotion.cpp new file mode 100644 index 00000000000000..e23bae7133624e --- /dev/null +++ b/src/coreclr/jit/promotion.cpp @@ -0,0 +1,1282 @@ +#include "jitpch.h" +#include "promotion.h" +#include "jitstd/algorithm.h" + +//------------------------------------------------------------------------ +// PhysicalPromotion: Promote structs based on primitive access patterns. +// +// Returns: +// Suitable phase status. +// +PhaseStatus Compiler::PhysicalPromotion() +{ + if (!opts.OptEnabled(CLFLG_STRUCTPROMOTE)) + { + return PhaseStatus::MODIFIED_NOTHING; + } + + if (fgNoStructPromotion) + { + return PhaseStatus::MODIFIED_NOTHING; + } + + if (!compStressCompile(STRESS_PHYSICAL_PROMOTION, 25)) + { + return PhaseStatus::MODIFIED_NOTHING; + } + +#ifdef DEBUG + static ConfigMethodRange s_range; + s_range.EnsureInit(JitConfig.JitEnablePhysicalPromotionRange()); + + if (!s_range.Contains(info.compMethodHash())) + { + return PhaseStatus::MODIFIED_NOTHING; + } +#endif + + Promotion prom(this); + return prom.Run(); +} + +// Represents an access into a struct local. +struct Access +{ + ClassLayout* Layout; + unsigned Offset; + var_types AccessType; + + // Number of times we saw this access. + unsigned Count = 0; + // Number of times this access is on the RHS of an assignment. + unsigned CountAssignmentSource = 0; + // Number of times this access is on the LHS of an assignment. 
+ unsigned CountAssignmentDestination = 0; + unsigned CountCallArgs = 0; + unsigned CountReturns = 0; + unsigned CountPassedAsRetbuf = 0; + + weight_t CountWtd = 0; + weight_t CountAssignmentSourceWtd = 0; + weight_t CountAssignmentDestinationWtd = 0; + weight_t CountCallArgsWtd = 0; + weight_t CountReturnsWtd = 0; + weight_t CountPassedAsRetbufWtd = 0; + + Access(unsigned offset, var_types accessType, ClassLayout* layout) + : Layout(layout), Offset(offset), AccessType(accessType) + { + } + + unsigned GetAccessSize() const + { + return AccessType == TYP_STRUCT ? Layout->GetSize() : genTypeSize(AccessType); + } + + bool Overlaps(unsigned otherStart, unsigned otherSize) const + { + unsigned end = Offset + GetAccessSize(); + if (end <= otherStart) + { + return false; + } + + unsigned otherEnd = otherStart + otherSize; + if (otherEnd <= Offset) + { + return false; + } + + return true; + } +}; + +//------------------------------------------------------------------------ +// BinarySearch: +// Find first entry with an equal offset, or bitwise complement of first +// entry with a higher offset. +// +// Parameters: +// vec - The vector to binary search in +// offset - The offset to search for +// +// Returns: +// Index of the first entry with an equal offset, or bitwise complement of +// first entry with a higher offset. +// +template +static size_t BinarySearch(const jitstd::vector& vec, unsigned offset) +{ + size_t min = 0; + size_t max = vec.size(); + while (min < max) + { + size_t mid = min + (max - min) / 2; + if (vec[mid].*field == offset) + { + while (mid > 0 && vec[mid - 1].*field == offset) + { + mid--; + } + + return mid; + } + if (vec[mid].*field < offset) + { + min = mid + 1; + } + else + { + max = mid; + } + } + + return ~min; +} + +// Represents a single replacement of a (field) access into a struct local. +struct Replacement +{ + unsigned Offset; + var_types AccessType; + unsigned LclNum; + // Is the replacement local (given by LclNum) fresher than the value in the struct local? + bool NeedsWriteBack = true; + // Is the value in the struct local fresher than the replacement local? + bool NeedsReadBack = false; + + Replacement(unsigned offset, var_types accessType, unsigned lclNum) + : Offset(offset), AccessType(accessType), LclNum(lclNum) + { + } + + bool Overlaps(unsigned otherStart, unsigned otherSize) const + { + unsigned end = Offset + genTypeSize(AccessType); + if (end <= otherStart) + { + return false; + } + + unsigned otherEnd = otherStart + otherSize; + if (otherEnd <= Offset) + { + return false; + } + + return true; + } +}; + +enum class AccessKindFlags : uint32_t +{ + None = 0, + IsCallArg = 1, + IsAssignmentSource = 2, + IsAssignmentDestination = 4, + IsCallRetBuf = 8, + IsReturned = 16, +}; + +inline constexpr AccessKindFlags operator~(AccessKindFlags a) +{ + return (AccessKindFlags)(~(uint32_t)a); +} + +inline constexpr AccessKindFlags operator|(AccessKindFlags a, AccessKindFlags b) +{ + return (AccessKindFlags)((uint32_t)a | (uint32_t)b); +} + +inline constexpr AccessKindFlags operator&(AccessKindFlags a, AccessKindFlags b) +{ + return (AccessKindFlags)((uint32_t)a & (uint32_t)b); +} + +inline AccessKindFlags& operator|=(AccessKindFlags& a, AccessKindFlags b) +{ + return a = (AccessKindFlags)((uint32_t)a | (uint32_t)b); +} + +inline AccessKindFlags& operator&=(AccessKindFlags& a, AccessKindFlags b) +{ + return a = (AccessKindFlags)((uint32_t)a & (uint32_t)b); +} + +// Tracks all the accesses into one particular struct local. 
+class LocalUses +{ + jitstd::vector m_accesses; + +public: + LocalUses(Compiler* comp) : m_accesses(comp->getAllocator(CMK_Promotion)) + { + } + + //------------------------------------------------------------------------ + // RecordAccess: + // Record an access into this local with the specified offset and access type. + // + // Parameters: + // offs - The offset being accessed + // accessType - The type of the access + // accessLayout - The layout of the access, for accessType == TYP_STRUCT + // flags - Flags classifying the access + // weight - Weight of the block containing the access + // + void RecordAccess( + unsigned offs, var_types accessType, ClassLayout* accessLayout, AccessKindFlags flags, weight_t weight) + { + Access* access = nullptr; + + size_t index = 0; + if (m_accesses.size() > 0) + { + index = BinarySearch(m_accesses, offs); + if ((ssize_t)index >= 0) + { + do + { + Access& candidateAccess = m_accesses[index]; + if ((candidateAccess.AccessType == accessType) && (candidateAccess.Layout == accessLayout)) + { + access = &candidateAccess; + break; + } + + index++; + } while (index < m_accesses.size() && m_accesses[index].Offset == offs); + } + else + { + index = ~index; + } + } + + if (access == nullptr) + { + access = &*m_accesses.insert(m_accesses.begin() + index, Access(offs, accessType, accessLayout)); + } + + access->Count++; + access->CountWtd += weight; + + if ((flags & AccessKindFlags::IsAssignmentSource) != AccessKindFlags::None) + { + access->CountAssignmentSource++; + access->CountAssignmentSourceWtd += weight; + } + + if ((flags & AccessKindFlags::IsAssignmentDestination) != AccessKindFlags::None) + { + access->CountAssignmentDestination++; + access->CountAssignmentDestinationWtd += weight; + } + + if ((flags & AccessKindFlags::IsCallArg) != AccessKindFlags::None) + { + access->CountCallArgs++; + access->CountCallArgsWtd += weight; + } + + if ((flags & AccessKindFlags::IsCallRetBuf) != AccessKindFlags::None) + { + access->CountPassedAsRetbuf++; + access->CountPassedAsRetbufWtd += weight; + } + + if ((flags & AccessKindFlags::IsReturned) != AccessKindFlags::None) + { + access->CountReturns++; + access->CountReturnsWtd += weight; + } + } + + //------------------------------------------------------------------------ + // PickPromotions: + // Pick specific replacements to make for this struct local after a set + // of accesses have been recorded. 
+ // + // Parameters: + // comp - Compiler instance + // lclNum - Local num for this struct local + // replacements - [out] Pointer to vector to create and insert replacements into + // + void PickPromotions(Compiler* comp, unsigned lclNum, jitstd::vector** replacements) + { + if (m_accesses.size() <= 0) + { + return; + } + + assert(*replacements == nullptr); + for (size_t i = 0; i < m_accesses.size(); i++) + { + const Access& access = m_accesses[i]; + + if (access.AccessType == TYP_STRUCT) + { + continue; + } + + if (!EvaluateReplacement(comp, lclNum, access)) + { + continue; + } + +#ifdef DEBUG + char buf[32]; + sprintf_s(buf, sizeof(buf), "V%02u.[%03u..%03u)", lclNum, access.Offset, + access.Offset + genTypeSize(access.AccessType)); + size_t len = strlen(buf) + 1; + char* bufp = new (comp, CMK_DebugOnly) char[len]; + strcpy_s(bufp, len, buf); +#endif + unsigned newLcl = comp->lvaGrabTemp(false DEBUGARG(bufp)); + LclVarDsc* dsc = comp->lvaGetDesc(newLcl); + dsc->lvType = access.AccessType; + + if (*replacements == nullptr) + { + *replacements = + new (comp, CMK_Promotion) jitstd::vector(comp->getAllocator(CMK_Promotion)); + } + + (*replacements)->push_back(Replacement(access.Offset, access.AccessType, newLcl)); + } + } + + //------------------------------------------------------------------------ + // EvaluateReplacement: + // Evaluate legality and profitability of a single replacement candidate. + // + // Parameters: + // comp - Compiler instance + // lclNum - Local num for this struct local + // access - Access information for the candidate. + // + // Returns: + // True if we should promote this access and create a replacement; otherwise false. + // + bool EvaluateReplacement(Compiler* comp, unsigned lclNum, const Access& access) + { + weight_t countOverlappedCallsWtd = 0; + weight_t countOverlappedReturnsWtd = 0; + weight_t countOverlappedRetbufsWtd = 0; + weight_t countOverlappedAssignmentDestinationWtd = 0; + weight_t countOverlappedAssignmentSourceWtd = 0; + + bool overlap = false; + for (const Access& otherAccess : m_accesses) + { + if (&otherAccess == &access) + continue; + + if (!otherAccess.Overlaps(access.Offset, genTypeSize(access.AccessType))) + { + continue; + } + + if (otherAccess.AccessType != TYP_STRUCT) + { + return false; + } + + countOverlappedCallsWtd += otherAccess.CountCallArgsWtd; + countOverlappedReturnsWtd += otherAccess.CountReturnsWtd; + countOverlappedRetbufsWtd += otherAccess.CountPassedAsRetbufWtd; + countOverlappedAssignmentDestinationWtd += otherAccess.CountAssignmentDestinationWtd; + countOverlappedAssignmentSourceWtd += otherAccess.CountAssignmentSourceWtd; + } + + // TODO-CQ: Tune the following heuristics. Currently they are based on + // x64 code size although using BB weights when available. This mixing + // does not make sense. + weight_t costWithout = 0; + + // A normal access without promotion looks like: + // mov reg, [reg+offs] + // It may also be contained. Overall we are going to cost each use of + // an unpromoted local at 6.5 bytes. + // TODO-CQ: We can make much better guesses on what will and won't be contained. + costWithout += access.CountWtd * 6.5; + + weight_t costWith = 0; + + // For any use we expect to just use the register directly. We will cost this at 3.5 bytes. 
+ costWith += access.CountWtd * 3.5; + + weight_t countReadBacksWtd = 0; + LclVarDsc* lcl = comp->lvaGetDesc(lclNum); + // For parameters or OSR locals we need an initial read back + if (lcl->lvIsParam || lcl->lvIsOSRLocal) + { + countReadBacksWtd += comp->fgFirstBB->getBBWeight(comp); + } + + countReadBacksWtd += countOverlappedRetbufsWtd; + countReadBacksWtd += countOverlappedAssignmentDestinationWtd; + + // A read back puts the value from stack back to (hopefully) register. We cost it at 5 bytes. + costWith += countReadBacksWtd * 5; + + // Write backs with TYP_REFs when the base local is an implicit byref + // involves checked write barriers, so they are very expensive. + // TODO-CQ: This should be adjusted once we type implicit byrefs as TYP_I_IMPL. + weight_t writeBackCost = comp->lvaIsImplicitByRefLocal(lclNum) && (access.AccessType == TYP_REF) ? 15 : 5; + weight_t countWriteBacksWtd = + countOverlappedCallsWtd + countOverlappedReturnsWtd + countOverlappedAssignmentSourceWtd; + costWith += countWriteBacksWtd * writeBackCost; + + JITDUMP("Evaluating access %s @ %03u\n", varTypeName(access.AccessType), access.Offset); + JITDUMP(" Single write-back cost: " FMT_WT "\n", writeBackCost); + JITDUMP(" Write backs: " FMT_WT "\n", countWriteBacksWtd); + JITDUMP(" Read backs: " FMT_WT "\n", countReadBacksWtd); + JITDUMP(" Cost with: " FMT_WT "\n", costWith); + JITDUMP(" Cost without: " FMT_WT "\n", costWithout); + + if (costWith < costWithout) + { + JITDUMP(" Promoting replacement\n"); + return true; + } + +#ifdef DEBUG + if (comp->compStressCompile(Compiler::STRESS_PHYSICAL_PROMOTION_COST, 25)) + { + JITDUMP(" Promoting replacement due to stress\n"); + return true; + } +#endif + + JITDUMP(" Disqualifying replacement\n"); + return false; + } + +#ifdef DEBUG + void Dump(unsigned lclNum) + { + if (m_accesses.size() <= 0) + { + return; + } + + printf("Accesses for V%02u\n", lclNum); + for (Access& access : m_accesses) + { + if (access.AccessType == TYP_STRUCT) + { + printf(" [%03u..%03u)\n", access.Offset, access.Offset + access.Layout->GetSize()); + } + else + { + printf(" %s @ %03u\n", varTypeName(access.AccessType), access.Offset); + } + + printf(" #: (%u, " FMT_WT ")\n", access.Count, access.CountWtd); + printf(" # assigned from: (%u, " FMT_WT ")\n", access.CountAssignmentSource, + access.CountAssignmentSourceWtd); + printf(" # assigned to: (%u, " FMT_WT ")\n", access.CountAssignmentDestination, + access.CountAssignmentDestinationWtd); + printf(" # as call arg: (%u, " FMT_WT ")\n", access.CountCallArgs, + access.CountCallArgsWtd); + printf(" # as retbuf: (%u, " FMT_WT ")\n", access.CountPassedAsRetbuf, + access.CountPassedAsRetbufWtd); + printf(" # as returned value: (%u, " FMT_WT ")\n\n", access.CountReturns, + access.CountReturnsWtd); + } + } +#endif +}; + +// Visitor that records information about uses of struct locals. +class LocalsUseVisitor : public GenTreeVisitor +{ + Promotion* m_prom; + LocalUses** m_uses; + BasicBlock* m_curBB = nullptr; + +public: + enum + { + DoPreOrder = true, + }; + + LocalsUseVisitor(Promotion* prom) : GenTreeVisitor(prom->m_compiler), m_prom(prom) + { + m_uses = new (prom->m_compiler, CMK_Promotion) LocalUses*[prom->m_compiler->lvaCount]{}; + } + + //------------------------------------------------------------------------ + // SetBB: + // Set current BB we are visiting. Used to get BB weights for access costing. + // + // Parameters: + // bb - The current basic block. 
+ // + void SetBB(BasicBlock* bb) + { + m_curBB = bb; + } + + //------------------------------------------------------------------------ + // GetUsesByLocal: + // Get the uses information for a specified local. + // + // Parameters: + // bb - The current basic block. + // + // Returns: + // Information about uses, or null if this local has no uses information + // associated with it. + // + LocalUses* GetUsesByLocal(unsigned lcl) + { + return m_uses[lcl]; + } + + fgWalkResult PreOrderVisit(GenTree** use, GenTree* user) + { + GenTree* tree = *use; + + if (tree->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_LCL_ADDR)) + { + GenTreeLclVarCommon* lcl = tree->AsLclVarCommon(); + LclVarDsc* dsc = m_compiler->lvaGetDesc(lcl); + if (!dsc->lvPromoted && (dsc->TypeGet() == TYP_STRUCT) && !dsc->IsAddressExposed()) + { + var_types accessType; + ClassLayout* accessLayout; + AccessKindFlags accessFlags; + + if (lcl->OperIs(GT_LCL_ADDR)) + { + assert(user->OperIs(GT_CALL) && dsc->IsHiddenBufferStructArg() && + (user->AsCall()->gtArgs.GetRetBufferArg()->GetNode() == lcl)); + + accessType = TYP_STRUCT; + accessLayout = m_compiler->typGetObjLayout(user->AsCall()->gtRetClsHnd); + accessFlags = AccessKindFlags::IsCallRetBuf; + } + else + { + accessType = lcl->TypeGet(); + accessLayout = accessType == TYP_STRUCT ? lcl->GetLayout(m_compiler) : nullptr; + accessFlags = ClassifyLocalRead(lcl, user); + } + + LocalUses* uses = GetOrCreateUses(lcl->GetLclNum()); + unsigned offs = lcl->GetLclOffs(); + uses->RecordAccess(offs, accessType, accessLayout, accessFlags, m_curBB->getBBWeight(m_compiler)); + } + } + + return fgWalkResult::WALK_CONTINUE; + } + +private: + //------------------------------------------------------------------------ + // GetOrCreateUses: + // Get the uses information for a local. Create it if it does not already exist. + // + // Parameters: + // lclNum - The local + // + // Returns: + // Uses information. + // + LocalUses* GetOrCreateUses(unsigned lclNum) + { + if (m_uses[lclNum] == nullptr) + { + m_uses[lclNum] = new (m_compiler, CMK_Promotion) LocalUses(m_compiler); + } + + return m_uses[lclNum]; + } + //------------------------------------------------------------------------ + // ClassifyLocalAccess: + // Given a local use and its user, classify information about it. + // + // Parameters: + // lcl - The local + // user - The user of the local. + // + // Returns: + // Flags classifying the access. 
+ // + AccessKindFlags ClassifyLocalRead(GenTreeLclVarCommon* lcl, GenTree* user) + { + assert(lcl->OperIsLocalRead()); + + AccessKindFlags flags = AccessKindFlags::None; + if (user->IsCall()) + { + GenTreeCall* call = user->AsCall(); + unsigned argIndex = 0; + for (CallArg& arg : call->gtArgs.Args()) + { + if (arg.GetNode() != lcl) + { + argIndex++; + continue; + } + + flags |= AccessKindFlags::IsCallArg; + + unsigned argSize = 0; + if (arg.GetSignatureType() != TYP_STRUCT) + { + argSize = genTypeSize(arg.GetSignatureType()); + } + else + { + argSize = m_compiler->typGetObjLayout(arg.GetSignatureClassHandle())->GetSize(); + } + + break; + } + } + + if (user->OperIs(GT_ASG)) + { + if (user->gtGetOp1() == lcl) + { + flags |= AccessKindFlags::IsAssignmentDestination; + } + + if (user->gtGetOp2() == lcl) + { + flags |= AccessKindFlags::IsAssignmentSource; + } + } + + if (user->OperIs(GT_RETURN)) + { + assert(user->gtGetOp1() == lcl); + flags |= AccessKindFlags::IsReturned; + } + + return flags; + } +}; + +class ReplaceVisitor : public GenTreeVisitor +{ + Promotion* m_prom; + jitstd::vector** m_replacements; + bool m_madeChanges = false; + +public: + enum + { + DoPostOrder = true, + UseExecutionOrder = true, + }; + + ReplaceVisitor(Promotion* prom, jitstd::vector** replacements) + : GenTreeVisitor(prom->m_compiler), m_prom(prom), m_replacements(replacements) + { + } + + bool MadeChanges() + { + return m_madeChanges; + } + + void Reset() + { + m_madeChanges = false; + } + + fgWalkResult PostOrderVisit(GenTree** use, GenTree* user) + { + GenTree* tree = *use; + + if (tree->OperIs(GT_ASG)) + { + // If LHS of the ASG was a local then we skipped it as we don't + // want to see it until after the RHS. + if (tree->gtGetOp1()->OperIs(GT_LCL_VAR, GT_LCL_FLD)) + { + ReplaceLocal(&tree->AsOp()->gtOp1, tree); + } + + // Assignments can be decomposed directly into accesses of the replacements. + DecomposeAssignment((*use)->AsOp(), user); + return fgWalkResult::WALK_CONTINUE; + } + + if (tree->OperIs(GT_CALL)) + { + // Calls need to store replacements back into the struct local for args + // and need to restore replacements from the result (for + // retbufs/returns). + LoadStoreAroundCall((*use)->AsCall(), user); + return fgWalkResult::WALK_CONTINUE; + } + + if (tree->OperIs(GT_RETURN)) + { + // Returns need to store replacements back into the struct local. + StoreBeforeReturn((*use)->AsUnOp()); + return fgWalkResult::WALK_CONTINUE; + } + + // Skip the local on the LHS of ASGs when we see it in the normal tree + // visit; we handle it as part of the parent ASG instead. + if (tree->OperIs(GT_LCL_VAR, GT_LCL_FLD) && + ((user == nullptr) || !user->OperIs(GT_ASG) || (user->gtGetOp1() != tree))) + { + ReplaceLocal(use, user); + return fgWalkResult::WALK_CONTINUE; + } + + return fgWalkResult::WALK_CONTINUE; + } + + //------------------------------------------------------------------------ + // DecomposeAssignment: + // Handle an assignment that may be between struct locals with replacements. + // + // Parameters: + // asg - The assignment + // user - The user of the assignment. + // + void DecomposeAssignment(GenTreeOp* asg, GenTree* user) + { + // TODO-CQ: field-by-field copies and inits. 
+ + if (asg->gtGetOp2()->OperIs(GT_LCL_VAR, GT_LCL_FLD)) + { + GenTreeLclVarCommon* rhsLcl = asg->gtGetOp2()->AsLclVarCommon(); + if (rhsLcl->TypeIs(TYP_STRUCT)) + { + unsigned size = rhsLcl->GetLayout(m_compiler)->GetSize(); + WriteBackBefore(&asg->gtOp2, rhsLcl->GetLclNum(), rhsLcl->GetLclOffs(), size); + } + } + + if (asg->gtGetOp1()->OperIs(GT_LCL_VAR, GT_LCL_FLD)) + { + GenTreeLclVarCommon* lhsLcl = asg->gtGetOp1()->AsLclVarCommon(); + if (lhsLcl->TypeIs(TYP_STRUCT)) + { + unsigned size = lhsLcl->GetLayout(m_compiler)->GetSize(); + MarkForReadBack(lhsLcl->GetLclNum(), lhsLcl->GetLclOffs(), size, true); + } + } + } + + //------------------------------------------------------------------------ + // LoadStoreAroundCall: + // Handle a call that may involve struct local arguments and that may + // pass a struct local with replacements as the retbuf. + // + // Parameters: + // call - The call + // user - The user of the call. + // + void LoadStoreAroundCall(GenTreeCall* call, GenTree* user) + { + CallArg* retBufArg = nullptr; + for (CallArg& arg : call->gtArgs.Args()) + { + if (arg.GetWellKnownArg() == WellKnownArg::RetBuffer) + { + retBufArg = &arg; + continue; + } + + if (!arg.GetNode()->OperIs(GT_LCL_VAR, GT_LCL_FLD)) + { + continue; + } + + GenTreeLclVarCommon* argNodeLcl = arg.GetNode()->AsLclVarCommon(); + + if (argNodeLcl->TypeIs(TYP_STRUCT)) + { + unsigned size = argNodeLcl->GetLayout(m_compiler)->GetSize(); + WriteBackBefore(&arg.EarlyNodeRef(), argNodeLcl->GetLclNum(), argNodeLcl->GetLclOffs(), size); + } + } + + if (call->IsOptimizingRetBufAsLocal()) + { + assert(retBufArg != nullptr); + assert(retBufArg->GetNode()->OperIs(GT_LCL_ADDR)); + GenTreeLclVarCommon* retBufLcl = retBufArg->GetNode()->AsLclVarCommon(); + unsigned size = m_compiler->typGetObjLayout(call->gtRetClsHnd)->GetSize(); + + MarkForReadBack(retBufLcl->GetLclNum(), retBufLcl->GetLclOffs(), size); + } + } + + //------------------------------------------------------------------------ + // ReplaceLocal: + // Handle a local that may need to be replaced. + // + // Parameters: + // use - The use of the local + // user - The user of the local. + // + // Notes: + // This usually amounts to making a replacement like + // + // LCL_FLD int V00 [+8] -> LCL_VAR int V10. + // + // In some cases we may have a pending read back, meaning that the + // replacement local is out-of-date compared to the struct local. + // In that case we also need to insert IR to read it back. + // This happens for example if the struct local was just assigned from a + // call or via a block copy. + // + void ReplaceLocal(GenTree** use, GenTree* user) + { + GenTreeLclVarCommon* lcl = (*use)->AsLclVarCommon(); + unsigned lclNum = lcl->GetLclNum(); + if (m_replacements[lclNum] == nullptr) + { + return; + } + + jitstd::vector& replacements = *m_replacements[lclNum]; + + unsigned offs = lcl->GetLclOffs(); + var_types accessType = lcl->TypeGet(); + +#ifdef DEBUG + if (accessType == TYP_STRUCT) + { + assert((user == nullptr) || user->OperIs(GT_ASG, GT_CALL, GT_RETURN)); + } + else + { + ClassLayout* accessLayout = accessType == TYP_STRUCT ? lcl->GetLayout(m_compiler) : nullptr; + unsigned accessSize = accessLayout != nullptr ? 
accessLayout->GetSize() : genTypeSize(accessType); + for (const Replacement& rep : replacements) + { + assert(!rep.Overlaps(offs, accessSize) || ((rep.Offset == offs) && (rep.AccessType == accessType))); + } + + assert((accessType != TYP_STRUCT) || (accessLayout != nullptr)); + JITDUMP("Processing use [%06u] of V%02u.[%03u..%03u)\n", Compiler::dspTreeID(lcl), lclNum, offs, + offs + accessSize); + } +#endif + + if (accessType == TYP_STRUCT) + { + // Will be handled once we get to the parent. + return; + } + + size_t index = BinarySearch(replacements, offs); + if ((ssize_t)index < 0) + { + // Access that we don't have a replacement for. + return; + } + + Replacement& rep = replacements[index]; + assert(accessType == rep.AccessType); + JITDUMP(" ..replaced with promoted lcl V%02u\n", rep.LclNum); + *use = m_compiler->gtNewLclvNode(rep.LclNum, accessType); + + if ((lcl->gtFlags & GTF_VAR_DEF) != 0) + { + rep.NeedsWriteBack = true; + rep.NeedsReadBack = false; + } + else if (rep.NeedsReadBack) + { + GenTree* dst = m_compiler->gtNewLclvNode(rep.LclNum, rep.AccessType); + GenTree* src = m_compiler->gtNewLclFldNode(lclNum, rep.AccessType, rep.Offset); + *use = m_compiler->gtNewOperNode(GT_COMMA, (*use)->TypeGet(), m_compiler->gtNewAssignNode(dst, src), *use); + rep.NeedsReadBack = false; + + // TODO-CQ: Local copy prop does not take into account that the + // uses of LCL_VAR occur at the user, which means it may introduce + // illegally overlapping lifetimes, such as: + // + // └──▌ ADD int + // ├──▌ LCL_VAR int V10 tmp6 -> copy propagated to [V35 tmp31] + // └──▌ COMMA int + // ├──▌ ASG int + // │ ├──▌ LCL_VAR int V35 tmp31 + // │ └──▌ LCL_FLD int V03 loc1 [+4] + // This really ought to be handled by local copy prop, but the way it works during + // morph makes it hard to fix there. + // + // This is the short term fix. Long term fixes may be: + // 1. Fix local copy prop + // 2. Teach LSRA to allow the above cases, simplifying IR concepts (e.g. + // introduce something like GT_COPY on top of LCL_VAR when they + // need to be "defs") + // 3. Change the pass here to avoid creating any embedded assignments by making use + // of gtSplitTree. We will only need to split in very edge cases since the point + // at which the replacement was marked as needing read back is practically always + // going to be in a previous statement, so this shouldn't be too bad for CQ. + + m_compiler->lvaGetDesc(rep.LclNum)->lvRedefinedInEmbeddedStatement = true; + } + + m_madeChanges = true; + } + + //------------------------------------------------------------------------ + // StoreBeforeReturn: + // Handle a return of a potential struct local. + // + // Parameters: + // ret - The GT_RETURN node + // + void StoreBeforeReturn(GenTreeUnOp* ret) + { + if (ret->TypeIs(TYP_VOID) || !ret->gtGetOp1()->OperIs(GT_LCL_VAR, GT_LCL_FLD)) + { + return; + } + + GenTreeLclVarCommon* retLcl = ret->gtGetOp1()->AsLclVarCommon(); + if (retLcl->TypeIs(TYP_STRUCT)) + { + unsigned size = retLcl->GetLayout(m_compiler)->GetSize(); + WriteBackBefore(&ret->gtOp1, retLcl->GetLclNum(), retLcl->GetLclOffs(), size); + } + } + + //------------------------------------------------------------------------ + // WriteBackBefore: + // Update the use with IR that writes back all necessary overlapping + // replacements into a struct local. 
+ // + // Parameters: + // use - The use, which will be updated with a cascading comma trees of assignments + // lcl - The struct local + // offs - The starting offset into the struct local of the overlapping range to write back to + // size - The size of the overlapping range + // + void WriteBackBefore(GenTree** use, unsigned lcl, unsigned offs, unsigned size) + { + if (m_replacements[lcl] == nullptr) + { + return; + } + + jitstd::vector& replacements = *m_replacements[lcl]; + size_t index = BinarySearch(replacements, offs); + + if ((ssize_t)index < 0) + { + index = ~index; + if ((index > 0) && replacements[index - 1].Overlaps(offs, size)) + { + index--; + } + } + + unsigned end = offs + size; + while ((index < replacements.size()) && (replacements[index].Offset < end)) + { + Replacement& rep = replacements[index]; + if (rep.NeedsWriteBack) + { + GenTree* dst = m_compiler->gtNewLclFldNode(lcl, rep.AccessType, rep.Offset); + GenTree* src = m_compiler->gtNewLclvNode(rep.LclNum, rep.AccessType); + GenTreeOp* comma = + m_compiler->gtNewOperNode(GT_COMMA, (*use)->TypeGet(), m_compiler->gtNewAssignNode(dst, src), *use); + *use = comma; + use = &comma->gtOp2; + + rep.NeedsWriteBack = false; + m_madeChanges = true; + } + + index++; + } + } + + //------------------------------------------------------------------------ + // MarkForReadBack: + // Mark that replacements in the specified struct local need to be read + // back before their next use. + // + // Parameters: + // lcl - The struct local + // offs - The starting offset of the range in the struct local that needs to be read back from. + // size - The size of the range + // conservative - Whether this is a potentially conservative read back + // that we can handle more efficiently in the future (only used for + // logging purposes) + // + void MarkForReadBack(unsigned lcl, unsigned offs, unsigned size, bool conservative = false) + { + if (m_replacements[lcl] == nullptr) + { + return; + } + + jitstd::vector& replacements = *m_replacements[lcl]; + size_t index = BinarySearch(replacements, offs); + + if ((ssize_t)index < 0) + { + index = ~index; + if ((index > 0) && replacements[index - 1].Overlaps(offs, size)) + { + index--; + } + } + + unsigned end = offs + size; + while ((index < replacements.size()) && (replacements[index].Offset < end)) + { + Replacement& rep = replacements[index]; + assert(rep.Overlaps(offs, size)); + rep.NeedsReadBack = true; + rep.NeedsWriteBack = false; + index++; + + if (conservative) + { + JITDUMP("*** NYI: Conservatively marked as read-back\n"); + conservative = false; + } + } + } +}; + +//------------------------------------------------------------------------ +// Promotion::Run: +// Run the promotion phase. +// +// Returns: +// Suitable phase status. +// +PhaseStatus Promotion::Run() +{ + if (m_compiler->lvaCount <= 0) + { + return PhaseStatus::MODIFIED_NOTHING; + } + + // First collect information about uses of locals + LocalsUseVisitor localsUse(this); + for (BasicBlock* bb : m_compiler->Blocks()) + { + localsUse.SetBB(bb); + + for (Statement* stmt : bb->Statements()) + { + localsUse.WalkTree(stmt->GetRootNodePointer(), nullptr); + } + } + + unsigned numLocals = m_compiler->lvaCount; + +#ifdef DEBUG + if (m_compiler->verbose) + { + for (unsigned lcl = 0; lcl < m_compiler->lvaCount; lcl++) + { + LocalUses* uses = localsUse.GetUsesByLocal(lcl); + if (uses != nullptr) + { + uses->Dump(lcl); + } + } + } +#endif + + // Pick promotion based on the use information we just collected. 
+ bool anyReplacements = false; + jitstd::vector** replacements = + new (m_compiler, CMK_Promotion) jitstd::vector*[m_compiler->lvaCount]{}; + for (unsigned i = 0; i < numLocals; i++) + { + LocalUses* uses = localsUse.GetUsesByLocal(i); + if (uses == nullptr) + { + continue; + } + + uses->PickPromotions(m_compiler, i, &replacements[i]); + + if (replacements[i] != nullptr) + { + assert(replacements[i]->size() > 0); + anyReplacements = true; +#ifdef DEBUG + JITDUMP("V%02u promoted with %d replacements\n", i, (int)replacements[i]->size()); + for (const Replacement& rep : *replacements[i]) + { + JITDUMP(" [%03u..%03u) promoted as %s V%02u\n", rep.Offset, rep.Offset + genTypeSize(rep.AccessType), + varTypeName(rep.AccessType), rep.LclNum); + } +#endif + } + } + + if (!anyReplacements) + { + return PhaseStatus::MODIFIED_NOTHING; + } + + // Make all replacements we decided on. + ReplaceVisitor replacer(this, replacements); + for (BasicBlock* bb : m_compiler->Blocks()) + { + for (Statement* stmt : bb->Statements()) + { + DISPSTMT(stmt); + replacer.Reset(); + replacer.WalkTree(stmt->GetRootNodePointer(), nullptr); + + if (replacer.MadeChanges()) + { + m_compiler->fgSequenceLocals(stmt); + m_compiler->gtUpdateStmtSideEffects(stmt); + JITDUMP("New statement:\n"); + DISPSTMT(stmt); + } + } + + for (unsigned i = 0; i < numLocals; i++) + { + if (replacements[i] == nullptr) + { + continue; + } + + for (Replacement& rep : *replacements[i]) + { + assert(!rep.NeedsReadBack || !rep.NeedsWriteBack); + if (rep.NeedsReadBack) + { + JITDUMP("Reading back replacement V%02u.[%03u..%03u) -> V%02u at the end of " FMT_BB "\n", i, + rep.Offset, rep.Offset + genTypeSize(rep.AccessType), rep.LclNum, bb->bbNum); + + GenTree* dst = m_compiler->gtNewLclvNode(rep.LclNum, rep.AccessType); + GenTree* src = m_compiler->gtNewLclFldNode(i, rep.AccessType, rep.Offset); + GenTree* asg = m_compiler->gtNewAssignNode(dst, src); + m_compiler->fgInsertStmtNearEnd(bb, m_compiler->fgNewStmtFromTree(asg)); + rep.NeedsReadBack = false; + } + + rep.NeedsWriteBack = true; + } + } + } + + // Insert initial IR to read arguments/OSR locals into replacement locals, + // and add necessary explicit zeroing. + Statement* prevStmt = nullptr; + for (unsigned lclNum = 0; lclNum < numLocals; lclNum++) + { + if (replacements[lclNum] == nullptr) + { + continue; + } + + LclVarDsc* dsc = m_compiler->lvaGetDesc(lclNum); + if (dsc->lvIsParam || dsc->lvIsOSRLocal) + { + InsertInitialReadBack(lclNum, *replacements[lclNum], &prevStmt); + } + else if (dsc->lvSuppressedZeroInit) + { + // We may have suppressed inserting an explicit zero init based on the + // assumption that the entire local will be zero inited in the prolog. + // Now that we are promoting some fields that assumption may be + // invalidated for those fields, and we may need to insert explicit + // zero inits again. + ExplicitlyZeroInitReplacementLocals(lclNum, *replacements[lclNum], &prevStmt); + } + } + + return PhaseStatus::MODIFIED_EVERYTHING; +} + +//------------------------------------------------------------------------ +// Promotion::InsertInitialReadBack: +// Insert IR to initially read a struct local's value into its promoted field locals. 
+// +// Parameters: +// lclNum - The struct local +// replacements - Replacements for the struct local +// prevStmt - [in, out] Previous statement to insert after +// +void Promotion::InsertInitialReadBack(unsigned lclNum, + const jitstd::vector& replacements, + Statement** prevStmt) +{ + for (unsigned i = 0; i < replacements.size(); i++) + { + const Replacement& rep = replacements[i]; + + GenTree* dst = m_compiler->gtNewLclvNode(rep.LclNum, rep.AccessType); + GenTree* src = m_compiler->gtNewLclFldNode(lclNum, rep.AccessType, rep.Offset); + GenTree* asg = m_compiler->gtNewAssignNode(dst, src); + InsertInitStatement(prevStmt, asg); + } +} + +//------------------------------------------------------------------------ +// Promotion::ExplicitlyZeroInitReplacementLocals: +// Insert IR to zero out replacement locals if necessary. +// +// Parameters: +// lclNum - The struct local +// replacements - Replacements for the struct local +// prevStmt - [in, out] Previous statement to insert after +// +void Promotion::ExplicitlyZeroInitReplacementLocals(unsigned lclNum, + const jitstd::vector& replacements, + Statement** prevStmt) +{ + for (unsigned i = 0; i < replacements.size(); i++) + { + const Replacement& rep = replacements[i]; + + if (!m_compiler->fgVarNeedsExplicitZeroInit(rep.LclNum, false, false)) + { + // Other downstream code (e.g. recursive-tailcalls-to-loops opt) may + // still need to insert further explicit zero initing. + m_compiler->lvaGetDesc(rep.LclNum)->lvSuppressedZeroInit = true; + continue; + } + + GenTree* dst = m_compiler->gtNewLclvNode(rep.LclNum, rep.AccessType); + GenTree* src = m_compiler->gtNewZeroConNode(rep.AccessType); + GenTree* asg = m_compiler->gtNewAssignNode(dst, src); + InsertInitStatement(prevStmt, asg); + } +} + +//------------------------------------------------------------------------ +// Promotion::InsertInitStatement: +// Insert a new statement after the specified statement in the scratch block, +// or at the beginning of the scratch block if no other statements were +// inserted yet. +// +// Parameters: +// prevStmt - [in, out] Previous statement to insert after +// tree - Tree to create statement from +// +void Promotion::InsertInitStatement(Statement** prevStmt, GenTree* tree) +{ + m_compiler->fgEnsureFirstBBisScratch(); + Statement* stmt = m_compiler->fgNewStmtFromTree(tree); + if (*prevStmt != nullptr) + { + m_compiler->fgInsertStmtAfter(m_compiler->fgFirstBB, *prevStmt, stmt); + } + else + { + m_compiler->fgInsertStmtAtBeg(m_compiler->fgFirstBB, stmt); + } + + *prevStmt = stmt; +} diff --git a/src/coreclr/jit/promotion.h b/src/coreclr/jit/promotion.h new file mode 100644 index 00000000000000..2ae42e3312c0f8 --- /dev/null +++ b/src/coreclr/jit/promotion.h @@ -0,0 +1,33 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +#ifndef _PROMOTION_H +#define _PROMOTION_H + +#include "compiler.h" +#include "vector.h" + +struct Replacement; + +class Promotion +{ + Compiler* m_compiler; + + friend class LocalsUseVisitor; + friend class ReplaceVisitor; + + void InsertInitialReadBack(unsigned lclNum, const jitstd::vector& replacements, Statement** prevStmt); + void ExplicitlyZeroInitReplacementLocals(unsigned lclNum, + const jitstd::vector& replacements, + Statement** prevStmt); + void InsertInitStatement(Statement** prevStmt, GenTree* tree); + +public: + explicit Promotion(Compiler* compiler) : m_compiler(compiler) + { + } + + PhaseStatus Run(); +}; + +#endif diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index eb3656263eadc0..f745c2d1285852 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -49,6 +49,7 @@ DOTNET_JitStress; DOTNET_JitStressProcedureSplitting; DOTNET_JitStressRegs; + DOTNET_JitStressModeNames; DOTNET_TailcallStress; DOTNET_ReadyToRun; DOTNET_ZapDisable; @@ -214,6 +215,8 @@ + +