diff --git a/src/coreclr/jit/inductionvariableopts.cpp b/src/coreclr/jit/inductionvariableopts.cpp
index fa3d627feadcc8..990100ec8e3bbc 100644
--- a/src/coreclr/jit/inductionvariableopts.cpp
+++ b/src/coreclr/jit/inductionvariableopts.cpp
@@ -1341,7 +1341,11 @@ class StrengthReductionContext
     bool        CheckAdvancedCursors(ArrayStack<CursorInfo>* cursors, ScevAddRec** nextIV);
     bool        StaysWithinManagedObject(ArrayStack<CursorInfo>* cursors, ScevAddRec* addRec);
     bool        TryReplaceUsesWithNewPrimaryIV(ArrayStack<CursorInfo>* cursors, ScevAddRec* iv);
-    BasicBlock* FindUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors);
+    BasicBlock* FindUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors, Statement** afterStmt);
+    BasicBlock* FindPostUseUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors,
+                                                BasicBlock*             backEdgeDominator,
+                                                Statement**             afterStmt);
+    bool        InsertionPointPostDominatesUses(BasicBlock* insertionPoint, ArrayStack<CursorInfo>* cursors);
 
     bool StressProfitability()
     {
@@ -2000,7 +2004,8 @@ bool StrengthReductionContext::TryReplaceUsesWithNewPrimaryIV(ArrayStack<CursorInfo>* cursors, ScevAddRec* iv)
-    BasicBlock* insertionPoint = FindUpdateInsertionPoint(cursors);
+    Statement*  afterStmt;
+    BasicBlock* insertionPoint = FindUpdateInsertionPoint(cursors, &afterStmt);
     GenTree* nextValue =
         m_comp->gtNewOperNode(GT_ADD, iv->Type, m_comp->gtNewLclVarNode(newPrimaryIV, iv->Type), stepValue);
     GenTree*   stepStore = m_comp->gtNewTempStore(newPrimaryIV, nextValue);
     Statement* stepStmt  = m_comp->fgNewStmtFromTree(stepStore);
-    m_comp->fgInsertStmtNearEnd(insertionPoint, stepStmt);
+    if (afterStmt != nullptr)
+    {
+        m_comp->fgInsertStmtAfter(insertionPoint, afterStmt, stepStmt);
+    }
+    else
+    {
+        m_comp->fgInsertStmtNearEnd(insertionPoint, stepStmt);
+    }
 
     JITDUMP(" Inserting step statement in " FMT_BB "\n", insertionPoint->bbNum);
     DISPSTMT(stepStmt);
@@ -2084,22 +2096,27 @@ bool StrengthReductionContext::TryReplaceUsesWithNewPrimaryIV(ArrayStack<CursorInfo>* cursors, ScevAddRec* iv)
-BasicBlock* StrengthReductionContext::FindUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors)
+BasicBlock* StrengthReductionContext::FindUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors, Statement** afterStmt)
 {
+    *afterStmt = nullptr;
+
     // Find insertion point. It needs to post-dominate all uses we are going to
     // replace and it needs to dominate all backedges.
     // TODO-CQ: Canonicalizing backedges would make this simpler and work in
     // more cases.
     BasicBlock* insertionPoint = nullptr;
+
     for (FlowEdge* backEdge : m_loop->BackEdges())
     {
         if (insertionPoint == nullptr)
@@ -2112,6 +2129,18 @@ BasicBlock* StrengthReductionContext::FindUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors)
+#if defined(TARGET_ARM64)
+    // Prefer to insert the update right after the last use of the IV; this
+    // allows lowering to combine it with that use into a post-indexed access.
+    BasicBlock* postUseInsertionPoint = FindPostUseUpdateInsertionPoint(cursors, insertionPoint, afterStmt);
+    if (postUseInsertionPoint != nullptr)
+    {
+        JITDUMP(" Found post-use insertion point " FMT_BB " after " FMT_STMT "\n", postUseInsertionPoint->bbNum,
+                (*afterStmt)->GetID());
+        return postUseInsertionPoint;
+    }
+#endif
+
     while ((insertionPoint != nullptr) && m_loop->ContainsBlock(insertionPoint) &&
            m_loop->MayExecuteBlockMultipleTimesPerIteration(insertionPoint))
     {
@@ -2123,6 +2152,124 @@ BasicBlock* StrengthReductionContext::FindUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors)
+    JITDUMP(" Found insertion point " FMT_BB "\n", insertionPoint->bbNum);
+    return insertionPoint;
+}
+
+//------------------------------------------------------------------------
+// FindPostUseUpdateInsertionPoint: Try finding an insertion point for the IV
+// update that is right after one of the uses of it.
+//
+// Parameters:
+//   cursors           - The list of cursors pointing to uses that are being replaced by
+//                       the new IV
+//   backEdgeDominator - A basic block that dominates all backedges
+//   afterStmt         - [out] Statement to insert the update after, if the
+//                       return value is non-null.
+//
+// Returns:
+//   nullptr if no such insertion point could be found. Otherwise returns the
+//   basic block and statement after which the update can be inserted.
+//
+BasicBlock* StrengthReductionContext::FindPostUseUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors,
+                                                                      BasicBlock*             backEdgeDominator,
+                                                                      Statement**             afterStmt)
+{
+    BitVecTraits poTraits = m_loop->GetDfsTree()->PostOrderTraits();
+
+#ifdef DEBUG
+    // We will be relying on the fact that the cursors are ordered in a useful
+    // way here: loop locals are visited in post order within each basic block,
+    // meaning that "cursors" has the last uses first for each basic block.
+    // Assert that here.
+
+    BitVec seenBlocks(BitVecOps::MakeEmpty(&poTraits));
+    for (int i = 1; i < cursors->Height(); i++)
+    {
+        CursorInfo& prevCursor = cursors->BottomRef(i - 1);
+        CursorInfo& cursor     = cursors->BottomRef(i);
+
+        if (cursor.Block != prevCursor.Block)
+        {
+            assert(BitVecOps::TryAddElemD(&poTraits, seenBlocks, prevCursor.Block->bbPostorderNum));
+            continue;
+        }
+
+        Statement* curStmt = cursor.Stmt;
+        while ((curStmt != nullptr) && (curStmt != prevCursor.Stmt))
+        {
+            curStmt = curStmt->GetNextStmt();
+        }
+
+        assert(curStmt == prevCursor.Stmt);
+    }
+#endif
+
+    BitVec blocksWithUses(BitVecOps::MakeEmpty(&poTraits));
+    for (int i = 0; i < cursors->Height(); i++)
+    {
+        CursorInfo& cursor = cursors->BottomRef(i);
+        BitVecOps::AddElemD(&poTraits, blocksWithUses, cursor.Block->bbPostorderNum);
+    }
+
+    while ((backEdgeDominator != nullptr) && m_loop->ContainsBlock(backEdgeDominator))
+    {
+        if (!BitVecOps::IsMember(&poTraits, blocksWithUses, backEdgeDominator->bbPostorderNum))
+        {
+            backEdgeDominator = backEdgeDominator->bbIDom;
+            continue;
+        }
+
+        if (m_loop->MayExecuteBlockMultipleTimesPerIteration(backEdgeDominator))
+        {
+            return nullptr;
+        }
+
+        for (int i = 0; i < cursors->Height(); i++)
+        {
+            CursorInfo& cursor = cursors->BottomRef(i);
+            if (cursor.Block != backEdgeDominator)
+            {
+                continue;
+            }
+
+            if (!InsertionPointPostDominatesUses(cursor.Block, cursors))
+            {
+                return nullptr;
+            }
+
+            *afterStmt = cursor.Stmt;
+            return cursor.Block;
+        }
+    }
+
+    return nullptr;
+}
+
+//------------------------------------------------------------------------
+// InsertionPointPostDominatesUses: Check if a basic block post-dominates all
+// locations specified by the cursors.
+//
+// Parameters:
+//   insertionPoint - The insertion point
+//   cursors        - Cursors specifying locations
+//
+// Returns:
+//   True if so.
+//
+// Remarks:
+//   For cursors inside "insertionPoint", the function expects that the
+//   insertion point is _after_ the use, except if the use is in a terminator
+//   statement.
+//
+bool StrengthReductionContext::InsertionPointPostDominatesUses(BasicBlock*             insertionPoint,
+                                                               ArrayStack<CursorInfo>* cursors)
+{
     for (int i = 0; i < cursors->Height(); i++)
     {
         CursorInfo& cursor = cursors->BottomRef(i);
@@ -2131,19 +2278,19 @@ BasicBlock* StrengthReductionContext::FindUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors)
         {
             if (insertionPoint->HasTerminator() && (cursor.Stmt == insertionPoint->lastStmt()))
             {
-                return nullptr;
+                return false;
             }
         }
         else
         {
             if (!m_loop->IsPostDominatedOnLoopIteration(cursor.Block, insertionPoint))
             {
-                return nullptr;
+                return false;
             }
         }
     }
 
-    return insertionPoint;
+    return true;
 }
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index d57d1b893d68af..acf3f052f62d21 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -632,8 +632,7 @@ GenTree* Lowering::LowerNode(GenTree* node)
             FALLTHROUGH;
 
         case GT_STORE_LCL_FLD:
-            LowerStoreLocCommon(node->AsLclVarCommon());
-            break;
+            return LowerStoreLocCommon(node->AsLclVarCommon());
 
 #if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
         case GT_CMPXCHG:
@@ -4783,7 +4782,10 @@ void Lowering::LowerRet(GenTreeOp* ret)
 // Arguments:
 //     lclStore - The store lcl node to lower.
 //
-void Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore)
+// Returns:
+//     Next node to lower.
+//
+GenTree* Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore)
 {
     assert(lclStore->OperIs(GT_STORE_LCL_FLD, GT_STORE_LCL_VAR));
     JITDUMP("lowering store lcl var/field (before):\n");
@@ -4870,8 +4872,7 @@ void Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore)
                 lclStore->gtOp1 = spilledCall;
                 src             = lclStore->gtOp1;
                 JITDUMP("lowering store lcl var/field has to spill call src.\n");
-                LowerStoreLocCommon(lclStore);
-                return;
+                return LowerStoreLocCommon(lclStore);
             }
 #endif // !WINDOWS_AMD64_ABI
             convertToStoreObj = false;
@@ -4966,7 +4967,7 @@ void Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore)
             DISPTREERANGE(BlockRange(), objStore);
             JITDUMP("\n");
 
-            return;
+            return objStore->gtNext;
         }
     }
 
@@ -4984,11 +4985,13 @@ void Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore)
         ContainCheckBitCast(bitcast);
     }
 
-    LowerStoreLoc(lclStore);
+    GenTree* next = LowerStoreLoc(lclStore);
 
     JITDUMP("lowering store lcl var/field (after):\n");
     DISPTREERANGE(BlockRange(), lclStore);
     JITDUMP("\n");
+
+    return next;
 }
 
 //----------------------------------------------------------------------------------------------
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index 81e337abdaf668..43cb09d96986bd 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -157,7 +157,7 @@ class Lowering final : public Phase
     GenTreeCC* LowerNodeCC(GenTree* node, GenCondition condition);
     void LowerJmpMethod(GenTree* jmp);
     void LowerRet(GenTreeOp* ret);
-    void LowerStoreLocCommon(GenTreeLclVarCommon* lclVar);
+    GenTree* LowerStoreLocCommon(GenTreeLclVarCommon* lclVar);
     void LowerRetStruct(GenTreeUnOp* ret);
     void LowerRetSingleRegStructLclVar(GenTreeUnOp* ret);
     void LowerCallStruct(GenTreeCall* call);
@@ -353,6 +353,8 @@ class Lowering final : public Phase
     GenTree* LowerIndir(GenTreeIndir* ind);
     bool OptimizeForLdpStp(GenTreeIndir* ind);
    bool TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir);
+    bool TryMoveAddSubRMWAfterIndir(GenTreeLclVarCommon* store);
+    bool TryMakeIndirAndStoreAdjacent(GenTreeIndir* prevIndir, GenTreeLclVarCommon* store);
     void MarkTree(GenTree* root);
     void UnmarkTree(GenTree* root);
     GenTree* LowerStoreIndir(GenTreeStoreInd* node);
@@ -401,11 +403,11 @@ class Lowering final : public Phase
     bool LowerRMWMemOp(GenTreeIndir* storeInd);
 #endif
 
-    void WidenSIMD12IfNecessary(GenTreeLclVarCommon* node);
-    bool CheckMultiRegLclVar(GenTreeLclVar* lclNode, int registerCount);
-    void LowerStoreLoc(GenTreeLclVarCommon* tree);
-    void LowerRotate(GenTree* tree);
-    void LowerShift(GenTreeOp* shift);
+    void     WidenSIMD12IfNecessary(GenTreeLclVarCommon* node);
+    bool     CheckMultiRegLclVar(GenTreeLclVar* lclNode, int registerCount);
+    GenTree* LowerStoreLoc(GenTreeLclVarCommon* tree);
+    void     LowerRotate(GenTree* tree);
+    void     LowerShift(GenTreeOp* shift);
 #ifdef FEATURE_HW_INTRINSICS
     GenTree* LowerHWIntrinsic(GenTreeHWIntrinsic* node);
     void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 855c4f21917e50..7962fd4c11fb68 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -472,7 +472,10 @@ bool Lowering::IsContainableUnaryOrBinaryOp(GenTree* parentNode, GenTree* childNode)
 // This involves:
 //   - Widening small stores (on ARM).
 //
-void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
+// Returns:
+//   Next node to lower.
+//
+GenTree* Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 {
 #ifdef TARGET_ARM
     // On ARM, small stores can cost a bit more in terms of code size so we try to widen them. This is legal
@@ -495,6 +498,17 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
     }
 
     ContainCheckStoreLoc(storeLoc);
+
+    GenTree* next = storeLoc->gtNext;
+
+#ifdef TARGET_ARM64
+    if (comp->opts.OptimizationEnabled())
+    {
+        TryMoveAddSubRMWAfterIndir(storeLoc);
+    }
+#endif
+
+    return next;
 }
 
 //------------------------------------------------------------------------
@@ -1053,6 +1067,203 @@ void Lowering::LowerModPow2(GenTree* node)
     ContainCheckNode(mod);
 }
 
+const int POST_INDEXED_ADDRESSING_MAX_DISTANCE = 16;
+
+//------------------------------------------------------------------------
+// TryMoveAddSubRMWAfterIndir: Try to move an RMW update of a local with an
+// ADD/SUB operand earlier to happen right after an indirection on the same
+// local, attempting to make these combinable into post-indexed addressing.
+//
+// Arguments:
+//   store - The store to a local
+//
+// Return Value:
+//   True if the store was moved; otherwise false.
+//
+bool Lowering::TryMoveAddSubRMWAfterIndir(GenTreeLclVarCommon* store)
+{
+    if (!store->OperIs(GT_STORE_LCL_VAR))
+    {
+        return false;
+    }
+
+    unsigned lclNum = store->GetLclNum();
+    if (comp->lvaGetDesc(lclNum)->lvDoNotEnregister)
+    {
+        return false;
+    }
+
+    GenTree* data = store->Data();
+    if (!data->OperIs(GT_ADD, GT_SUB) || data->gtOverflow())
+    {
+        return false;
+    }
+
+    GenTree* op1 = data->gtGetOp1();
+    GenTree* op2 = data->gtGetOp2();
+    if (!op1->OperIs(GT_LCL_VAR) || !op2->isContainedIntOrIImmed())
+    {
+        return false;
+    }
+
+    if (op1->AsLclVarCommon()->GetLclNum() != lclNum)
+    {
+        return false;
+    }
+
+    int maxCount = min(m_blockIndirs.Height(), POST_INDEXED_ADDRESSING_MAX_DISTANCE / 2);
+    for (int i = 0; i < maxCount; i++)
+    {
+        SavedIndir& prev = m_blockIndirs.TopRef(i);
+        if ((prev.AddrBase->GetLclNum() != lclNum) || (prev.Offset != 0))
+        {
+            continue;
+        }
+
+        GenTreeIndir* prevIndir = prev.Indir;
+        if ((prevIndir == nullptr) || (prevIndir->gtNext == nullptr))
+        {
+            continue;
+        }
+
+        JITDUMP(
+            "[%06u] is an RMW ADD/SUB on local V%02u which is used as the address to [%06u]. "
+            "Trying to make them adjacent.\n",
+            Compiler::dspTreeID(store), lclNum, Compiler::dspTreeID(prevIndir));
+
+        if (TryMakeIndirAndStoreAdjacent(prevIndir, store))
+        {
+            prev.Indir = nullptr;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+//------------------------------------------------------------------------
+// TryMakeIndirAndStoreAdjacent: Try to move a store earlier, right after the
+// specified indirection.
+//
+// Arguments:
+//   prevIndir - Indirection that comes before "store"
+//   store     - Store that we want to happen next to the indirection
+//
+// Return Value:
+//   True if the store was moved; otherwise false.
+//
+bool Lowering::TryMakeIndirAndStoreAdjacent(GenTreeIndir* prevIndir, GenTreeLclVarCommon* store)
+{
+    GenTree* cur = prevIndir;
+    for (int i = 0; i < POST_INDEXED_ADDRESSING_MAX_DISTANCE; i++)
+    {
+        cur = cur->gtNext;
+        if (cur == store)
+            break;
+    }
+
+    if (cur != store)
+    {
+        JITDUMP(" Too far separated, giving up\n");
+        return false;
+    }
+
+    JITDUMP(" They are close. Trying to move the following range (where * are nodes part of the data flow):\n\n");
+#ifdef DEBUG
+    bool     isClosed;
+    GenTree* startDumpNode = BlockRange().GetTreeRange(prevIndir, &isClosed).FirstNode();
+    GenTree* endDumpNode   = store->gtNext;
+
+    auto dumpWithMarks = [=]() {
+        if (!comp->verbose)
+        {
+            return;
+        }
+
+        for (GenTree* node = startDumpNode; node != endDumpNode; node = node->gtNext)
+        {
+            const char* prefix;
+            if (node == prevIndir)
+                prefix = "1. ";
+            else if (node == store)
+                prefix = "2. ";
+            else if ((node->gtLIRFlags & LIR::Flags::Mark) != 0)
+                prefix = "*  ";
+            else
+                prefix = "   ";
+
+            comp->gtDispLIRNode(node, prefix);
+        }
+    };
+
+#endif
+
+    MarkTree(store);
+
+    INDEBUG(dumpWithMarks());
+    JITDUMP("\n");
+
+    assert((prevIndir->gtLIRFlags & LIR::Flags::Mark) == 0);
+    m_scratchSideEffects.Clear();
+
+    for (GenTree* cur = prevIndir->gtNext; cur != store; cur = cur->gtNext)
+    {
+        if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0)
+        {
+            // 'cur' is part of data flow of 'store', so we will be moving the
+            // currently recorded effects past 'cur'.
+            if (m_scratchSideEffects.InterferesWith(comp, cur, true))
+            {
+                JITDUMP("Giving up due to interference with [%06u]\n", Compiler::dspTreeID(cur));
+                UnmarkTree(store);
+                return false;
+            }
+        }
+        else
+        {
+            // Not part of dataflow; add its effects that will move past
+            // 'store'.
+            m_scratchSideEffects.AddNode(comp, cur);
+        }
+    }
+
+    if (m_scratchSideEffects.InterferesWith(comp, store, true))
+    {
+        JITDUMP("Have interference. Giving up.\n");
+        UnmarkTree(store);
+        return false;
+    }
+
+    JITDUMP("Interference checks passed. Moving nodes that are not part of data flow of [%06u]\n\n",
+            Compiler::dspTreeID(store));
+
+    GenTree* previous = prevIndir;
+    for (GenTree* node = prevIndir->gtNext;;)
+    {
+        GenTree* next = node->gtNext;
+
+        if ((node->gtLIRFlags & LIR::Flags::Mark) != 0)
+        {
+            // Part of data flow. Move it to happen right after 'previous'.
+            BlockRange().Remove(node);
+            BlockRange().InsertAfter(previous, node);
+            previous = node;
+        }
+
+        if (node == store)
+        {
+            break;
+        }
+
+        node = next;
+    }
+
+    JITDUMP("Result:\n\n");
+    INDEBUG(dumpWithMarks());
+    JITDUMP("\n");
+    UnmarkTree(store);
+    return true;
+}
+
 //------------------------------------------------------------------------
 // LowerAddForPossibleContainment: Tries to lower GT_ADD in such a way
 // that would allow one of its operands
diff --git a/src/coreclr/jit/lowerloongarch64.cpp b/src/coreclr/jit/lowerloongarch64.cpp
index ffdf11c44ac37e..72435464dbb099 100644
--- a/src/coreclr/jit/lowerloongarch64.cpp
+++ b/src/coreclr/jit/lowerloongarch64.cpp
@@ -249,7 +249,10 @@ GenTree* Lowering::LowerBinaryArithmetic(GenTreeOp* binOp)
 // This involves:
 //   - Widening operations of unsigneds.
 //
-void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
+// Returns:
+//   Next node to lower.
+//
+GenTree* Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 {
     if (storeLoc->OperIs(GT_STORE_LCL_FLD))
     {
@@ -257,6 +260,7 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
         verifyLclFldDoNotEnregister(storeLoc->GetLclNum());
     }
     ContainCheckStoreLoc(storeLoc);
+    return storeLoc->gtNext;
 }
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/lowerriscv64.cpp b/src/coreclr/jit/lowerriscv64.cpp
index 8b74b1cc317813..55c4182430461d 100644
--- a/src/coreclr/jit/lowerriscv64.cpp
+++ b/src/coreclr/jit/lowerriscv64.cpp
@@ -198,7 +198,10 @@ GenTree* Lowering::LowerBinaryArithmetic(GenTreeOp* binOp)
 // This involves:
 //   - Widening operations of unsigneds.
 //
-void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
+// Returns:
+//   Next node to lower.
+//
+GenTree* Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 {
     if (storeLoc->OperIs(GT_STORE_LCL_FLD))
     {
@@ -206,6 +209,7 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
         verifyLclFldDoNotEnregister(storeLoc->GetLclNum());
     }
     ContainCheckStoreLoc(storeLoc);
+    return storeLoc->gtNext;
 }
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index db997aa3f426cb..8b199a898d4419 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -43,7 +43,10 @@ void Lowering::LowerRotate(GenTree* tree)
 //   - Handling of contained immediates.
 //   - Widening some small stores.
 //
-void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
+// Returns:
+//   Next tree to lower.
+//
+GenTree* Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 {
     // Most small locals (the exception is dependently promoted fields) have 4 byte wide stack slots, so
     // we can widen the store, if profitable. The widening is only (largely) profitable for 2 byte stores.
@@ -64,6 +67,7 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
     }
 
     ContainCheckStoreLoc(storeLoc);
+    return storeLoc->gtNext;
 }
 
 //------------------------------------------------------------------------
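
Note for reviewers: a minimal standalone sketch (illustrative only, not part of the patch; the function and variable names are made up) of the kind of loop both halves of the change target. The strength-reduced IV is a pointer that is dereferenced and then bumped by the element size each iteration; placing the bump immediately after the last use is what lets arm64 lowering fold the load and the update into a post-indexed access such as "ldr w2, [x0], #4".

// Illustrative source pattern (assumed, not taken from the PR). After the IV
// update is placed right after the indirection in LIR:
//     t = IND(p)      ; load through 'p'
//     p = ADD(p, 4)   ; RMW update of the same local
// the arm64 backend can emit a single post-indexed load instead of a
// separate load and add.
#include <cstddef>

int SumInts(const int* p, std::size_t count)
{
    int sum = 0;
    for (std::size_t i = 0; i < count; i++)
    {
        sum += *p; // last use of 'p' in this iteration
        p++;       // IV update the JIT wants adjacent to that use
    }
    return sum;
}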
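The search in FindPostUseUpdateInsertionPoint boils down to walking up the immediate-dominator chain from a block that dominates all backedges until a block containing a use is found, and then inserting the update right after the last use in that block. A rough sketch of that idea, using a simplified block type rather than the JIT's BasicBlock and loop APIs (names are assumed, illustrative only):

// Simplified model of the post-use insertion point search (illustrative).
struct Block
{
    Block* idom;                            // immediate dominator
    bool   inLoop;                          // still inside the loop?
    bool   hasUse;                          // contains a use of the IV?
    bool   mayRunMultipleTimesPerIteration; // e.g. a block in an inner loop
};

// Returns the block whose last IV use the update should follow, or nullptr
// if no suitable block exists (the caller then falls back to the generic
// insertion point).
Block* FindPostUseBlock(Block* backEdgeDominator)
{
    for (Block* b = backEdgeDominator; (b != nullptr) && b->inLoop; b = b->idom)
    {
        if (!b->hasUse)
        {
            continue; // no use here; keep walking up the dominator tree
        }

        if (b->mayRunMultipleTimesPerIteration)
        {
            return nullptr; // the update must run exactly once per iteration
        }

        return b; // insert the update right after the last use in this block
    }

    return nullptr;
}

The real implementation additionally verifies, via InsertionPointPostDominatesUses, that the chosen point post-dominates every use being replaced before accepting it.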