diff --git a/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp b/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp index 78b0a464269541..691f9973ce2626 100644 --- a/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp +++ b/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp @@ -1609,7 +1609,21 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */ jitInstance->mc->cr->AddCall("allocMem"); // TODO-Cleanup: Could hot block size be ever 0? - *hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize); + size_t codeAlignment = sizeof(void*); + size_t hotCodeAlignedSize = static_cast<size_t>(hotCodeSize); + + if ((flag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) + { + codeAlignment = 32; + } + else if ((flag & CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN) != 0) + { + codeAlignment = 16; + } + hotCodeAlignedSize = ALIGN_UP_SPMI(hotCodeAlignedSize, codeAlignment); + hotCodeAlignedSize = hotCodeAlignedSize + (codeAlignment - sizeof(void*)); + *hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeAlignedSize); + *hotCodeBlock = ALIGN_UP_SPMI(*hotCodeBlock, codeAlignment); if (coldCodeSize > 0) *coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize); diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index fb0d859f8db8b7..6ddd274ac9fce8 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -302,7 +302,6 @@ RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_UseIBCFile, W("UseIBCFile"), 0, "", CLRConf /// /// JIT /// -RETAIL_CONFIG_DWORD_INFO_DIRECT_ACCESS(UNSUPPORTED_JitAlignLoops, W("JitAlignLoops"), "Aligns loop targets to 8 byte boundaries") CONFIG_DWORD_INFO_EX(INTERNAL_JitBreakEmit, W("JitBreakEmit"), (DWORD)-1, "", CLRConfig::EEConfig_default) CONFIG_DWORD_INFO_DIRECT_ACCESS(INTERNAL_JitDebuggable, W("JitDebuggable"), "") #if !defined(DEBUG) && !defined(_DEBUG) diff --git a/src/coreclr/inc/corjitflags.h b/src/coreclr/inc/corjitflags.h index 83cbc20be8863a..5cea8a224c609d 100644 --- a/src/coreclr/inc/corjitflags.h +++ b/src/coreclr/inc/corjitflags.h @@ -79,45 +79,45 @@ class CORJIT_FLAGS CORJIT_FLAG_BBINSTR = 29, // Collect basic block profile information CORJIT_FLAG_BBOPT = 30, // Optimize method based on profile information CORJIT_FLAG_FRAMED = 31, // All methods have an EBP frame - CORJIT_FLAG_ALIGN_LOOPS = 32, // add NOPs before loops to align them at 16 byte boundaries + CORJIT_FLAG_UNUSED12 = 32, CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0. 
(used by IL stubs) - CORJIT_FLAG_UNUSED12 = 34, + CORJIT_FLAG_UNUSED13 = 34, CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background CORJIT_FLAG_USE_PINVOKE_HELPERS = 36, // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions CORJIT_FLAG_REVERSE_PINVOKE = 37, // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog - CORJIT_FLAG_UNUSED13 = 38, + CORJIT_FLAG_UNUSED14 = 38, CORJIT_FLAG_TIER0 = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible CORJIT_FLAG_TIER1 = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code #if defined(TARGET_ARM) CORJIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records #else // !defined(TARGET_ARM) - CORJIT_FLAG_UNUSED14 = 41, + CORJIT_FLAG_UNUSED15 = 41, #endif // !defined(TARGET_ARM) CORJIT_FLAG_NO_INLINING = 42, // JIT should not inline any called method into this method - CORJIT_FLAG_UNUSED15 = 43, - CORJIT_FLAG_UNUSED16 = 44, - CORJIT_FLAG_UNUSED17 = 45, - CORJIT_FLAG_UNUSED18 = 46, - CORJIT_FLAG_UNUSED19 = 47, - CORJIT_FLAG_UNUSED20 = 48, - CORJIT_FLAG_UNUSED21 = 49, - CORJIT_FLAG_UNUSED22 = 50, - CORJIT_FLAG_UNUSED23 = 51, - CORJIT_FLAG_UNUSED24 = 52, - CORJIT_FLAG_UNUSED25 = 53, - CORJIT_FLAG_UNUSED26 = 54, - CORJIT_FLAG_UNUSED27 = 55, - CORJIT_FLAG_UNUSED28 = 56, - CORJIT_FLAG_UNUSED29 = 57, - CORJIT_FLAG_UNUSED30 = 58, - CORJIT_FLAG_UNUSED31 = 59, - CORJIT_FLAG_UNUSED32 = 60, - CORJIT_FLAG_UNUSED33 = 61, - CORJIT_FLAG_UNUSED34 = 62, - CORJIT_FLAG_UNUSED35 = 63 + CORJIT_FLAG_UNUSED16 = 43, + CORJIT_FLAG_UNUSED17 = 44, + CORJIT_FLAG_UNUSED18 = 45, + CORJIT_FLAG_UNUSED19 = 46, + CORJIT_FLAG_UNUSED20 = 47, + CORJIT_FLAG_UNUSED21 = 48, + CORJIT_FLAG_UNUSED22 = 49, + CORJIT_FLAG_UNUSED23 = 50, + CORJIT_FLAG_UNUSED24 = 51, + CORJIT_FLAG_UNUSED25 = 52, + CORJIT_FLAG_UNUSED26 = 53, + CORJIT_FLAG_UNUSED27 = 54, + CORJIT_FLAG_UNUSED28 = 55, + CORJIT_FLAG_UNUSED29 = 56, + CORJIT_FLAG_UNUSED30 = 57, + CORJIT_FLAG_UNUSED31 = 58, + CORJIT_FLAG_UNUSED32 = 59, + CORJIT_FLAG_UNUSED33 = 60, + CORJIT_FLAG_UNUSED34 = 61, + CORJIT_FLAG_UNUSED35 = 62, + CORJIT_FLAG_UNUSED36 = 63 }; CORJIT_FLAGS() diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 6ee29b5a00fae6..e67969b5222d5a 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -31,11 +31,11 @@ // ////////////////////////////////////////////////////////////////////////////////////////////////////////// -constexpr GUID JITEEVersionIdentifier = { /* 8e32c24d-62fe-4d78-ae73-eedddb928ee2 */ - 0x8e32c24d, - 0x62fe, - 0x4d78, - {0xae, 0x73, 0xee, 0xdd, 0xdb, 0x92, 0x8e, 0xe2} +constexpr GUID JITEEVersionIdentifier = { /* de81f48e-7701-45f2-a91b-1914f88dfd11 */ + 0xde81f48e, + 0x7701, + 0x45f2, + {0xa9, 0x1b, 0x19, 0x14, 0xf8, 0x8d, 0xfd, 0x11} }; ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index f2b14599335f5f..6cea8dd2c367a2 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -505,6 +505,10 @@ void BasicBlock::dspFlags() { printf("cfe "); } + if (bbFlags & BBF_LOOP_ALIGN) + { + printf("align "); + } } /***************************************************************************** diff --git 
a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index 02c37361e831ce..d92f5b2c3550c1 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -448,6 +448,7 @@ struct BasicBlock : private LIR::Range #define BBF_PATCHPOINT MAKE_BBFLAG(36) // Block is a patchpoint #define BBF_HAS_CLASS_PROFILE MAKE_BBFLAG(37) // BB contains a call needing a class profile +#define BBF_LOOP_ALIGN MAKE_BBFLAG(39) // Block is lexically the first block in a loop we intend to align. // clang-format on @@ -463,6 +464,10 @@ struct BasicBlock : private LIR::Range { return ((bbFlags & BBF_LOOP_HEAD) != 0); } + bool isLoopAlign() const + { + return ((bbFlags & BBF_LOOP_ALIGN) != 0); + } // Flags to update when two blocks are compacted diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index d6eebc9d416152..8c4572dcec43f5 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -2258,6 +2258,12 @@ void CodeGen::genGenerateMachineCode() GetEmitter()->emitJumpDistBind(); +#if FEATURE_LOOP_ALIGN + /* Perform alignment adjustments */ + + GetEmitter()->emitLoopAlignAdjustments(); +#endif + /* The code is now complete and final; it should not change after this. */ } @@ -2345,10 +2351,12 @@ void CodeGen::genEmitMachineCode() #ifdef DEBUG if (compiler->opts.disAsm || verbose) { - printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d (MethodHash=%08x) for " + printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d, allocated bytes for " + "code %d (MethodHash=%08x) for " "method %s\n", - codeSize, prologSize, compiler->info.compPerfScore, instrCount, compiler->info.compMethodHash(), - compiler->info.compFullName); + codeSize, prologSize, compiler->info.compPerfScore, instrCount, + GetEmitter()->emitTotalHotCodeSize + GetEmitter()->emitTotalColdCodeSize, + compiler->info.compMethodHash(), compiler->info.compFullName); printf("; ============================================================\n\n"); printf(""); // in our logic this causes a flush } diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index bf8d1ce087adf9..215e3c04f75b59 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -311,13 +311,6 @@ void CodeGen::genCodeForBBlist() genUpdateCurrentFunclet(block); -#ifdef TARGET_XARCH - if (ShouldAlignLoops() && block->bbFlags & BBF_LOOP_HEAD) - { - GetEmitter()->emitLoopAlign(); - } -#endif - genLogLabel(block); // Tell everyone which basic block we're working on @@ -356,6 +349,14 @@ void CodeGen::genCodeForBBlist() needLabel = true; } +#if FEATURE_LOOP_ALIGN + if (GetEmitter()->emitEndsWithAlignInstr()) + { + // we had better be planning on starting a new IG + assert(needLabel); + } +#endif + if (needLabel) { // Mark a label and update the current set of live GC refs @@ -667,10 +668,6 @@ void CodeGen::genCodeForBBlist() switch (block->bbJumpKind) { - case BBJ_ALWAYS: - inst_JMP(EJ_jmp, block->bbJumpDest); - break; - case BBJ_RETURN: genExitCode(block); break; @@ -741,15 +738,55 @@ void CodeGen::genCodeForBBlist() #endif // !FEATURE_EH_FUNCLETS case BBJ_NONE: - case BBJ_COND: case BBJ_SWITCH: break; + case BBJ_ALWAYS: + inst_JMP(EJ_jmp, block->bbJumpDest); + FALLTHROUGH; + + case BBJ_COND: + +#if FEATURE_LOOP_ALIGN + // This is the last place where we operate on blocks and after this, we operate + // on IG. 
Hence, if we know that the destination of "block" is the first block + // of a loop and needs alignment (it has BBF_LOOP_ALIGN), then "block" represents the + // end of the loop. Propagate that information on the IG through "igLoopBackEdge". + // + // During emission, this information will be used to calculate the loop size. + // Depending on the loop size, we will then decide whether or not to align the loop. + + if (block->bbJumpDest->isLoopAlign()) + { + GetEmitter()->emitSetLoopBackEdge(block->bbJumpDest); + } +#endif + break; + default: noway_assert(!"Unexpected bbJumpKind"); break; } +#if FEATURE_LOOP_ALIGN + + // If the next block is the first block of a loop (identified by BBF_LOOP_ALIGN), + // then we need to add an align instruction in the current "block". Also mark the + // corresponding IG with IGF_LOOP_ALIGN so we know that there will be align + // instructions at the end of that IG. + // + // For non-adaptive alignment, add an alignment instruction whose size depends on + // compJitAlignLoopBoundary. + // For adaptive alignment, the alignment instruction will always be 15 bytes. + + if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign())) + { + assert(ShouldAlignLoops()); + + GetEmitter()->emitLoopAlignment(); + } +#endif + #if defined(DEBUG) && defined(USING_VARIABLE_LIVE_RANGE) if (compiler->verbose) { diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index b25afff5a6fdaa..7f53629f25496f 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2308,7 +2308,7 @@ void Compiler::compSetProcessor() opts.compUseCMOV = jitFlags.IsSet(JitFlags::JIT_FLAG_USE_CMOV); #ifdef DEBUG if (opts.compUseCMOV) - opts.compUseCMOV = !compStressCompile(STRESS_USE_CMOV, 50); + opts.compUseCMOV = !compStressCompile(STRESS_USE_CMOV, 50); #endif // DEBUG #endif // TARGET_X86 @@ -2615,6 +2615,29 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compDbgInfo = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_INFO); opts.compDbgEnC = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC); +#ifdef DEBUG + opts.compJitAlignLoopAdaptive = JitConfig.JitAlignLoopAdaptive() == 1; + opts.compJitAlignLoopBoundary = (unsigned short)JitConfig.JitAlignLoopBoundary(); + opts.compJitAlignLoopMinBlockWeight = (unsigned short)JitConfig.JitAlignLoopMinBlockWeight(); + + opts.compJitAlignLoopForJcc = JitConfig.JitAlignLoopForJcc() == 1; + opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize(); +#else + opts.compJitAlignLoopAdaptive = true; + opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY; + opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT; +#endif + if (opts.compJitAlignLoopAdaptive) + { + opts.compJitAlignPaddingLimit = (opts.compJitAlignLoopBoundary >> 1) - 1; + } + else + { + opts.compJitAlignPaddingLimit = opts.compJitAlignLoopBoundary - 1; + } + + assert(isPow2(opts.compJitAlignLoopBoundary)); + #if REGEN_SHORTCUTS || REGEN_CALLPAT // We never want to have debugging enabled when regenerating GC encoding patterns opts.compDbgCode = false; @@ -3913,19 +3936,17 @@ void Compiler::compSetOptimizationLevel() codeGen->setFrameRequired(true); #endif - if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_RELOC)) + if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT)) { - codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code - - // The zapper doesn't set JitFlags::JIT_FLAG_ALIGN_LOOPS, and there is - // no reason for it to set it as the JIT doesn't currently support loop alignment - // for prejitted images. 
(The JIT doesn't know the final address of the code, hence + // The JIT doesn't currently support loop alignment for prejitted images. + // (The JIT doesn't know the final address of the code, hence // it can't align code based on unknown addresses.) - assert(!opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS)); + + codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code } else { - codeGen->SetAlignLoops(opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS)); + codeGen->SetAlignLoops(JitConfig.JitAlignLoops() == 1); } } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 1cdebfb9c3c8aa..9af31fdf03a071 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6367,6 +6367,8 @@ class Compiler void optFindNaturalLoops(); + void optIdentifyLoopsForAlignment(); + // Ensures that all the loops in the loop nest rooted at "loopInd" (an index into the loop table) are 'canonical' -- // each loop has a unique "top." Returns "true" iff the flowgraph has been modified. bool optCanonicalizeLoopNest(unsigned char loopInd); @@ -9036,6 +9038,43 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool dspGCtbls; // Display the GC tables #endif +// Default numbers used to perform loop alignment. All the numbers are chosen +// based on experimenting with various benchmarks. + +// Default minimum loop block weight required to enable loop alignment. +#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 4 + +// By default a loop will be aligned at 32B address boundary to get better +// performance as per architecture manuals. +#define DEFAULT_ALIGN_LOOP_BOUNDARY 0x20 + +// For non-adaptive loop alignment, by default, only align a loop whose size is +// at most 3 times the alignment block size. If the loop is bigger than that, it is most +// likely complicated enough that loop alignment will not impact performance. +#define DEFAULT_MAX_LOOPSIZE_FOR_ALIGN (DEFAULT_ALIGN_LOOP_BOUNDARY * 3) + +#ifdef DEBUG + // Loop alignment variables + + // If set, for non-adaptive alignment, ensure loop jmps do not sit on or cross the alignment boundary. + bool compJitAlignLoopForJcc; #endif + // For non-adaptive alignment, maximum loop size (in bytes) for which alignment will be done. + unsigned short compJitAlignLoopMaxCodeSize; + + // Minimum weight needed for the first block of a loop to make it a candidate for alignment. + unsigned short compJitAlignLoopMinBlockWeight; + + // For non-adaptive alignment, address boundary (power of 2) at which loop alignment should + // be done. By default, 32B. + unsigned short compJitAlignLoopBoundary; + + // Padding limit to align a loop. + unsigned short compJitAlignPaddingLimit; + + // If set, perform adaptive loop alignment that limits the amount of padding based on loop size. 
+ bool compJitAlignLoopAdaptive; + #ifdef LATE_DISASM bool doLateDisasm; // Run the late disassembler #endif // LATE_DISASM diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 3056c71e6a0932..b42111611504d7 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -160,6 +160,8 @@ unsigned emitter::emitSmallCnsCnt; unsigned emitter::emitLargeCnsCnt; unsigned emitter::emitSmallCns[SMALL_CNS_TSZ]; +unsigned emitter::emitTotalDescAlignCnt; + void emitterStaticStats(FILE* fout) { // insGroup members @@ -387,6 +389,9 @@ void emitterStats(FILE* fout) fprintf(fout, "Total instrDescReloc: %8u (%5.2f%%)\n", emitter::emitTotalIDescRelocCnt, 100.0 * emitter::emitTotalIDescRelocCnt / emitter::emitTotalInsCnt); #endif // TARGET_ARM + fprintf(fout, "Total emitTotalDescAlignCnt: %8u (%5.2f%%)\n", emitter::emitTotalDescAlignCnt, + 100.0 * emitter::emitTotalDescAlignCnt / emitter::emitTotalInsCnt); + fprintf(fout, "\n"); } @@ -636,6 +641,10 @@ void emitter::emitGenIG(insGroup* ig) assert(emitCurIGjmpList == nullptr); +#if FEATURE_LOOP_ALIGN + assert(emitCurIGAlignList == nullptr); +#endif + /* Allocate the temp instruction buffer if we haven't done so */ if (emitCurIGfreeBase == nullptr) @@ -822,6 +831,60 @@ insGroup* emitter::emitSavIG(bool emitAdd) } #endif +#if FEATURE_LOOP_ALIGN + // Did we have any align instructions in this group? + if (emitCurIGAlignList) + { + instrDescAlign* list = nullptr; + instrDescAlign* last = nullptr; + + // Move align instructions to the global list, update their 'next' links + do + { + // Grab the align instruction and remove it from the list + + instrDescAlign* oa = emitCurIGAlignList; + emitCurIGAlignList = oa->idaNext; + + // Figure out the address of where the align got copied + + size_t of = (BYTE*)oa - emitCurIGfreeBase; + instrDescAlign* na = (instrDescAlign*)(ig->igData + of); + + assert(na->idaIG == ig); + assert(na->idIns() == oa->idIns()); + assert(na->idaNext == oa->idaNext); + assert(na->idIns() == INS_align); + + na->idaNext = list; + list = na; + + if (last == nullptr) + { + last = na; + } + } while (emitCurIGAlignList); + + // Should have at least one align instruction + assert(last); + + if (emitAlignList == nullptr) + { + assert(emitAlignLast == nullptr); + + last->idaNext = emitAlignList; + emitAlignList = list; + } + else + { + last->idaNext = nullptr; + emitAlignLast->idaNext = list; + } + + emitAlignLast = last; + } + +#endif // Did we have any jumps in this group? 
if (emitCurIGjmpList) @@ -933,6 +996,12 @@ void emitter::emitBegFN(bool hasFramePtr emitCurIGfreeBase = nullptr; emitIGbuffSize = 0; +#if FEATURE_LOOP_ALIGN + emitLastAlignedIgNum = 0; + emitLastInnerLoopStartIgNum = 0; + emitLastInnerLoopEndIgNum = 0; +#endif + /* Record stack frame info (the temp size is just an estimate) */ emitHasFramePtr = hasFramePtr; @@ -968,6 +1037,13 @@ void emitter::emitBegFN(bool hasFramePtr emitNoGCIG = false; emitForceNewIG = false; +#if FEATURE_LOOP_ALIGN + /* We don't have any align instructions */ + + emitAlignList = emitAlignLast = nullptr; + emitCurIGAlignList = nullptr; +#endif + /* We have not recorded any live sets */ assert(VarSetOps::IsEmpty(emitComp, emitThisGCrefVars)); @@ -3613,6 +3689,10 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) { size_t is; +#ifdef DEBUG + size_t beforeAddr = (size_t)*dp; +#endif + /* Record the beginning offset of the instruction */ BYTE* curInsAdr = *dp; @@ -3647,17 +3727,23 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) /* Did the size of the instruction match our expectations? */ - UNATIVE_OFFSET csz = (UNATIVE_OFFSET)(*dp - curInsAdr); + UNATIVE_OFFSET actualSize = (UNATIVE_OFFSET)(*dp - curInsAdr); - if (csz != id->idCodeSize()) + unsigned estimatedSize = id->idCodeSize(); + if (actualSize != estimatedSize) { - /* It is fatal to under-estimate the instruction size */ - noway_assert(id->idCodeSize() >= csz); + // It is fatal to under-estimate the instruction size, except for alignment instructions + noway_assert(estimatedSize >= actualSize); + +#if FEATURE_LOOP_ALIGN + // Should never over-estimate align instruction or any instruction before the last align instruction of a method + assert(id->idIns() != INS_align && emitCurIG->igNum > emitLastAlignedIgNum); +#endif #if DEBUG_EMIT if (EMITVERBOSE) { - printf("Instruction predicted size = %u, actual = %u\n", id->idCodeSize(), csz); + printf("Instruction predicted size = %u, actual = %u\n", estimatedSize, actualSize); } #endif // DEBUG_EMIT @@ -3665,7 +3751,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) ig->igFlags |= IGF_UPD_ISZ; #if defined(TARGET_XARCH) - id->idCodeSize(csz); + id->idCodeSize(actualSize); #elif defined(TARGET_ARM) // This is done as part of emitSetShortJump(); // insSize isz = emitInsSize(id->idInsFmt()); @@ -3684,6 +3770,51 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id)); assert(is == emitSizeOfInsDsc(id)); } + + // Print the alignment boundary + if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr) + { + size_t currAddr = (size_t)*dp; + size_t lastBoundaryAddr = currAddr & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1); + + // draw boundary if beforeAddr was before the lastBoundary. 
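+ // (i.e. the instruction beginning at beforeAddr is the first one to touch or cross that boundary)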
+ if (beforeAddr < lastBoundaryAddr) + { + printf("; "); + instruction currIns = id->idIns(); + +#if defined(TARGET_XARCH) + + // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf + bool isJccAffectedIns = + ((currIns >= INS_i_jmp && currIns < INS_align) || (currIns == INS_call) || (currIns == INS_ret)); + + instrDesc* nextId = id; + castto(nextId, BYTE*) += is; + instruction nextIns = nextId->idIns(); + if ((currIns == INS_cmp) || (currIns == INS_test) || (currIns == INS_add) || (currIns == INS_sub) || + (currIns == INS_and) || (currIns == INS_inc) || (currIns == INS_dec)) + { + isJccAffectedIns |= (nextIns >= INS_i_jmp && nextIns < INS_align); + } +#else + bool isJccAffectedIns = false; +#endif + + // Indicate if the instruction is on the 32B boundary or is split across it + unsigned bytesCrossedBoundary = (currAddr & (emitComp->opts.compJitAlignLoopBoundary - 1)); + if ((bytesCrossedBoundary != 0) || isJccAffectedIns) + { + printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(id->idIns()), + bytesCrossedBoundary); + } + else + { + printf("..............................."); + } + printf(" %dB boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary)); + } + } #endif return is; } @@ -4479,6 +4610,428 @@ void emitter::emitJumpDistBind() #endif // DEBUG } +#if FEATURE_LOOP_ALIGN + +//----------------------------------------------------------------------------- +// emitLoopAlignment: Insert an align instruction at the end of emitCurIG and +// mark it as IGF_LOOP_ALIGN to indicate that next IG is a +// loop needing alignment. +// +void emitter::emitLoopAlignment() +{ + if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive)) + { + emitLongLoopAlign(emitComp->opts.compJitAlignLoopBoundary); + } + else + { + emitLoopAlign(); + } + + // Mark this IG as needing alignment so that during emission we can check the instruction count heuristics of + // all IGs that follow this IG and participate in a loop. + emitCurIG->igFlags |= IGF_LOOP_ALIGN; + + JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u.\n", emitComp->opts.compJitAlignLoopBoundary, + emitComp->compMethodID, emitCurIG->igNum); +} + +//----------------------------------------------------------------------------- +// emitEndsWithAlignInstr: Checks if current IG ends with loop align instruction. +// +// Returns: true if current IG ends with align instruction. +// +bool emitter::emitEndsWithAlignInstr() +{ + return emitCurIG->isLoopAlign(); +} + +//----------------------------------------------------------------------------- +// getLoopSize: Starting from loopHeaderIg, find the size of the smallest possible loop +// such that it doesn't exceed the maxLoopSize. +// +// Arguments: +// igLoopHeader - The header IG of a loop +// maxLoopSize - Maximum loop size. If the loop is bigger than this value, we will just +// return this value. +// +// Returns: size of a loop in bytes. +// +unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize) +{ + unsigned loopSize = 0; + + for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext) + { + loopSize += igInLoop->igSize; + if (igInLoop->isLoopAlign()) + { + // If igInLoop's next IG is a loop and needs alignment, then igInLoop should be the last IG + // of the current loop and should have a back-edge to the current loop header. 
+ assert(igInLoop->igLoopBackEdge == igLoopHeader); + + // In such cases, the current loop size should exclude the align instruction size reserved for + // next loop. + loopSize -= emitComp->opts.compJitAlignPaddingLimit; + } + if ((igInLoop->igLoopBackEdge == igLoopHeader) || (loopSize > maxLoopSize)) + { + break; + } + } + + return loopSize; +} + +//----------------------------------------------------------------------------- +// emitSetLoopBackEdge : Sets igLoopBackEdge field, if not already set and +// if currIG has back-edge to dstIG. +// +// Notes: +// If the current loop encloses a loop that is already marked as align, then remove +// the alignment flag present on IG before dstIG. +// +void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) +{ + insGroup* dstIG = (insGroup*)loopTopBlock->bbEmitCookie; + + // With (dstIG != nullptr), ensure that only back edges are tracked. + // If there is forward jump, dstIG is not yet generated. + // + // We don't rely on (block->bbJumpDest->bbNum <= block->bbNum) because the basic + // block numbering is not guaranteed to be sequential. + + if ((dstIG != nullptr) && (dstIG->igNum <= emitCurIG->igNum)) + { + unsigned currLoopStart = dstIG->igNum; + unsigned currLoopEnd = emitCurIG->igNum; + + // Only mark back-edge if current loop starts after the last inner loop ended. + if (emitLastInnerLoopEndIgNum < currLoopStart) + { + emitCurIG->igLoopBackEdge = dstIG; + + JITDUMP("** IG%02u jumps back to IG%02u forming a loop.\n", currLoopEnd, currLoopStart); + + emitLastInnerLoopStartIgNum = currLoopStart; + emitLastInnerLoopEndIgNum = currLoopEnd; + } + // Otherwise, mark the dstIG->prevIG as no alignment needed. + // + // Note: If current loop's back-edge target is same as emitLastInnerLoopStartIgNum, + // retain the alignment flag of dstIG->prevIG so the loop + // (emitLastInnerLoopStartIgNum ~ emitLastInnerLoopEndIgNum) is still aligned. + else if (emitLastInnerLoopStartIgNum != currLoopStart) + { + // Find the IG before dstIG... + instrDescAlign* alignInstr = emitAlignList; + while ((alignInstr != nullptr) && (alignInstr->idaIG->igNext != dstIG)) + { + alignInstr = alignInstr->idaNext; + } + + // ...and clear the IGF_LOOP_ALIGN flag + if (alignInstr != nullptr) + { + assert(alignInstr->idaIG->igNext == dstIG); + alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN; + } + + JITDUMP( + "** Skip alignment for loop IG%02u ~ IG%02u, because it encloses an aligned loop IG%02u ~ IG%02u.\n", + currLoopStart, currLoopEnd, emitLastInnerLoopStartIgNum, emitLastInnerLoopEndIgNum); + } + } +} + +//----------------------------------------------------------------------------- +// emitLoopAlignAdjustments: Walk all the align instructions and update them +// with actual padding needed. + +// Notes: +// For IGs that have align instructions in the end, calculate the actual offset +// of loop start and determine how much padding is needed. Based on that, update +// the igOffs, igSize and emitTotalCodeSize. 
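+// For example, with adaptive alignment at a 32B boundary each align instruction initially reserves the full +// 15-byte padding limit; whatever is not needed at the final offsets is trimmed off here.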
+// +void emitter::emitLoopAlignAdjustments() +{ + // no align instructions + if (emitAlignList == nullptr) + { + return; + } + + JITDUMP("*************** In emitLoopAlignAdjustments()\n"); + + unsigned short estimatedPaddingNeeded = emitComp->opts.compJitAlignPaddingLimit; + unsigned short alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; + + unsigned alignBytesRemoved = 0; + unsigned loopSize = 0; + unsigned loopIGOffset = 0; + instrDescAlign* alignInstr = emitAlignList; + + for (; alignInstr != nullptr; alignInstr = alignInstr->idaNext) + { + assert(alignInstr->idIns() == INS_align); + + insGroup* alignIG = alignInstr->idaIG; + + loopIGOffset = alignIG->igOffs + alignIG->igSize; + + // igSize also includes INS_align instruction, take it off. + loopIGOffset -= estimatedPaddingNeeded; + + // IG can be marked as not needing alignment if during setting igLoopBackEdge, it is detected + // that the igLoopBackEdge encloses an IG that is marked for alignment. + unsigned actualPaddingNeeded = + alignIG->isLoopAlign() ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) : 0; + + assert(estimatedPaddingNeeded >= actualPaddingNeeded); + + unsigned short diff = (unsigned short)(estimatedPaddingNeeded - actualPaddingNeeded); + + if (diff != 0) + { + alignIG->igSize -= diff; + alignBytesRemoved += diff; + emitTotalCodeSize -= diff; + + // Update the flags + alignIG->igFlags |= IGF_UPD_ISZ; + if (actualPaddingNeeded == 0) + { + alignIG->igFlags &= ~IGF_LOOP_ALIGN; + } + + if (emitComp->opts.compJitAlignLoopAdaptive) + { + assert(actualPaddingNeeded < MAX_ENCODED_SIZE); + alignInstr->idCodeSize(actualPaddingNeeded); + } + else + { + unsigned paddingToAdj = actualPaddingNeeded; + +#ifdef DEBUG + + int instrAdjusted = (alignmentBoundary + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; +#endif + // Adjust the padding amount in all align instructions in this IG + instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr; + for (; alignInstrToAdj != nullptr && alignInstrToAdj->idaIG == alignInstr->idaIG; + alignInstrToAdj = alignInstrToAdj->idaNext) + { + unsigned newPadding = min(paddingToAdj, MAX_ENCODED_SIZE); + alignInstrToAdj->idCodeSize(newPadding); + paddingToAdj -= newPadding; + prevAlignInstr = alignInstrToAdj; +#ifdef DEBUG + instrAdjusted--; +#endif + } + assert(paddingToAdj == 0); + assert(instrAdjusted == 0); + + // fast forward the align instruction to next IG + alignInstr = prevAlignInstr; + } + + JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %02d to %02d\n", emitComp->compMethodID, alignIG->igNum, + estimatedPaddingNeeded, actualPaddingNeeded); + } + + // Adjust the offset of all IGs starting from next IG until we reach the IG having the next + // align instruction or the end of IG list. + insGroup* adjOffIG = alignIG->igNext; + insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast; + while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum)) + { + adjOffIG->igOffs -= alignBytesRemoved; + adjOffIG = adjOffIG->igNext; + } + + if (actualPaddingNeeded > 0) + { + // Record the last IG that has align instruction. No overestimation + // adjustment will be done after emitLastAlignedIgNum. 
+ emitLastAlignedIgNum = alignIG->igNum; + } + } + +#ifdef DEBUG + emitCheckIGoffsets(); +#endif +} + +//----------------------------------------------------------------------------- +// emitCalculatePaddingForLoopAlignment: Calculate the padding to insert at the +// end of 'ig' so the loop that starts after 'ig' is aligned. +// +// Returns: Padding amount. +// 0 means no padding is needed, either because loop is already aligned or it +// is too expensive to align loop and hence it will not be aligned. +// +// Notes: +// Below are the steps (in this order) to calculate the padding amount. +// 1. If loop is already aligned to desired boundary, then return 0. // already aligned +// 2. If loop size exceeds maximum allowed loop size, then return 0. // too big to align +// +// For adaptive loop alignment: +// 3a. Calculate paddingNeeded and maxPaddingAmount to align to 32B boundary. +// 3b. If paddingNeeded > maxPaddingAmount, then recalculate to align to 16B boundary. +// 3c. If paddingNeeded == 0, then return 0. // already aligned at 16B +// 3d. If paddingNeeded > maxPaddingAmount, then return 0. // expensive to align +// 3e. If the loop already fits in minimum 32B blocks, then return 0. // already best aligned +// 3f. return paddingNeeded. +// +// For non-adaptive loop alignment: +// 3a. Calculate paddingNeeded. +// 3b. If the loop already fits in minimum alignmentBoundary blocks, then return 0. // already best aligned +// 3c. return paddingNeeded. +// +unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, + size_t offset DEBUG_ARG(bool displayAlignmentDetails)) +{ + assert(ig->isLoopAlign()); + unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; + + // No padding if loop is already aligned + if ((offset & (alignmentBoundary - 1)) == 0) + { + JITDUMP(";; Skip alignment: 'Loop already aligned at %dB boundary.'\n", alignmentBoundary); + return 0; + } + + unsigned maxLoopSize = 0; + int maxLoopBlocksAllowed = 0; + + if (emitComp->opts.compJitAlignLoopAdaptive) + { + // For adaptive, adjust the loop size depending on the alignment boundary + maxLoopBlocksAllowed = genLog2((unsigned)alignmentBoundary) - 1; + maxLoopSize = alignmentBoundary * maxLoopBlocksAllowed; + } + else + { + // For non-adaptive, just take whatever is supplied using COMPlus_ variables + maxLoopSize = emitComp->opts.compJitAlignLoopMaxCodeSize; + } + + unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize); + + // No padding if loop is big + if (loopSize > maxLoopSize) + { + JITDUMP(";; Skip alignment: 'Loop is big. LoopSize= %d, MaxLoopSize= %d.'\n", loopSize, maxLoopSize); + return 0; + } + + unsigned paddingToAdd = 0; + unsigned minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary; + bool skipPadding = false; + + if (emitComp->opts.compJitAlignLoopAdaptive) + { + // adaptive loop alignment + unsigned nMaxPaddingBytes = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1)) - 1; + unsigned nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1); + + // Check if the alignment exceeds maxPadding limit + if (nPaddingBytes > nMaxPaddingBytes) + { + // Cannot align to 32B, so try to align to 16B boundary. 
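+ // Halve the boundary and recompute both the padding needed and the padding budget.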
+ alignmentBoundary >>= 1; + nMaxPaddingBytes = 1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1); + nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1); + + // Check if the loop is already at new alignment boundary + if (nPaddingBytes == 0) + { + skipPadding = true; + JITDUMP(";; Skip alignment: 'Loop already aligned at 16B boundary.'\n"); + } + // Check if the alignment exceeds new maxPadding limit + else if (nPaddingBytes > nMaxPaddingBytes) + { + skipPadding = true; + JITDUMP(";; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " + "AlignmentBoundary= %dB.'\n", + nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); + } + } + + // If within maxPaddingLimit + if (!skipPadding) + { + // Padding is needed only if loop starts at or after the current offset. + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. + size_t extraBytesNotInLoop = + (size_t)(emitComp->opts.compJitAlignLoopBoundary * minBlocksNeededForLoop) - loopSize; + size_t currentOffset = (size_t)offset % alignmentBoundary; + + if (currentOffset > extraBytesNotInLoop) + { + // Padding is needed only if loop starts at or after the current offset and hence might not + // fit in minBlocksNeededForLoop + paddingToAdd = nPaddingBytes; + } + else + { + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. + JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n", + minBlocksNeededForLoop, alignmentBoundary); + } + } + } + else + { + // non-adaptive loop alignment + unsigned extraBytesNotInLoop = (alignmentBoundary * minBlocksNeededForLoop) - loopSize; + unsigned currentOffset = (size_t)offset % alignmentBoundary; + +#ifdef DEBUG + // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary + if (emitComp->opts.compJitAlignLoopForJcc) + { + // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing? + currentOffset++; + } +#endif + + if (currentOffset > extraBytesNotInLoop) + { + // Padding is needed only if loop starts at or after the current offset and hence might not + // fit in minBlocksNeededForLoop + paddingToAdd = (-(int)(size_t)offset) & (alignmentBoundary - 1); + } + else + { + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. 
+ JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n", minBlocksNeededForLoop, + alignmentBoundary); + } + } + + JITDUMP(";; Calculated padding to add %d bytes to align at %dB boundary that starts at 0x%x.'\n", paddingToAdd, + alignmentBoundary, offset); + + // Either no padding is added because it is too expensive or the offset gets aligned + // to the alignment boundary + assert(paddingToAdd == 0 || (((offset + paddingToAdd) & (alignmentBoundary - 1)) == 0)); + + return paddingToAdd; +} + +#endif + void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG) { #ifdef DEBUG @@ -4841,6 +5394,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, (void**)&codeBlock, (void**)&coldCodeBlock, (void**)&consBlock); #endif +#ifdef DEBUG + if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) + { + assert(((size_t)codeBlock & 31) == 0); + } +#endif + // if (emitConsDsc.dsdOffs) // printf("Cons=%08X\n", consBlock); @@ -5374,14 +5934,10 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { printf("\n"); } - - if (emitComp->verbose) - { - printf("Allocated method code size = %4u , actual size = %4u\n", emitTotalCodeSize, cp - codeBlock); - } #endif unsigned actualCodeSize = emitCurCodeOffs(cp); + assert(emitTotalCodeSize >= actualCodeSize); #if EMITTER_STATS totAllocdSize += emitTotalCodeSize; @@ -5391,7 +5947,11 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, // Fill in eventual unused space, but do not report this space as used. // If you add this padding during the emitIGlist loop, then it will // emit offsets after the loop with wrong value (for example for GC ref variables). - unsigned unusedSize = emitTotalCodeSize - emitCurCodeOffs(cp); + unsigned unusedSize = emitTotalCodeSize - actualCodeSize; + + JITDUMP("Allocated method code size = %4u , actual size = %4u, unused size = %4u\n", emitTotalCodeSize, + actualCodeSize, unusedSize); + for (unsigned i = 0; i < unusedSize; ++i) { *cp++ = DEFAULT_CODE_BUFFER_INIT; @@ -7215,6 +7775,10 @@ void emitter::emitInitIG(insGroup* ig) ig->igSize = 0; ig->igGCregs = RBM_NONE; ig->igInsCnt = 0; + +#if FEATURE_LOOP_ALIGN + ig->igLoopBackEdge = nullptr; +#endif } /***************************************************************************** diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 0942a2df4ad93d..8030cc4b0fb16d 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -251,6 +251,10 @@ struct insGroup unsigned short igFlags; // see IGF_xxx below unsigned short igSize; // # of bytes of code in this group +#if FEATURE_LOOP_ALIGN + insGroup* igLoopBackEdge; // "last" back-edge that branches back to an aligned loop head. +#endif + #define IGF_GC_VARS 0x0001 // new set of live GC ref variables #define IGF_BYREF_REGS 0x0002 // new set of live by-ref registers #if defined(FEATURE_EH_FUNCLETS) && defined(TARGET_ARM) @@ -264,6 +268,8 @@ struct insGroup #define IGF_PLACEHOLDER 0x0100 // this is a placeholder group, to be filled in later #define IGF_EXTEND 0x0200 // this block is conceptually an extension of the previous block // and the emitter should continue to track GC info as if there was no new block. +#define IGF_LOOP_ALIGN 0x0400 // this group contains alignment instruction(s) at the end; the next IG is the + // head of a loop that needs alignment. // Mask of IGF_* flags that should be propagated to new blocks when they are created. 
// This allows prologs and epilogs to be any number of IGs, but still be @@ -336,6 +342,11 @@ struct insGroup return *(unsigned*)ptr; } + bool isLoopAlign() + { + return (igFlags & IGF_LOOP_ALIGN) != 0; + } + }; // end of struct insGroup // For AMD64 the maximum prolog/epilog size supported on the OS is 256 bytes @@ -561,6 +572,7 @@ class emitter #if defined(TARGET_XARCH) static_assert_no_msg(INS_count <= 1024); instruction _idIns : 10; +#define MAX_ENCODED_SIZE 15 #elif defined(TARGET_ARM64) static_assert_no_msg(INS_count <= 512); instruction _idIns : 9; @@ -1361,6 +1373,14 @@ class emitter // hot to cold and cold to hot jumps) }; +#if FEATURE_LOOP_ALIGN + struct instrDescAlign : instrDesc + { + instrDescAlign* idaNext; // next align in the group/method + insGroup* idaIG; // containing group + }; +#endif + #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT. struct instrDescLbl : instrDescJmp { @@ -1738,6 +1758,21 @@ class emitter instrDescJmp* emitJumpLast; // last of local jumps in method void emitJumpDistBind(); // Bind all the local jumps in method +#if FEATURE_LOOP_ALIGN + instrDescAlign* emitCurIGAlignList; // list of align instructions in current IG + unsigned emitLastInnerLoopStartIgNum; // Start IG of last inner loop + unsigned emitLastInnerLoopEndIgNum; // End IG of last inner loop + unsigned emitLastAlignedIgNum; // last IG that has align instruction + instrDescAlign* emitAlignList; // list of local align instructions in method + instrDescAlign* emitAlignLast; // last align instruction in method + unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size + void emitLoopAlignment(); + bool emitEndsWithAlignInstr(); // Checks if the current IG ends with an align instruction + void emitSetLoopBackEdge(BasicBlock* loopTopBlock); + void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments + unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails)); +#endif + void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets bool emitFwdJumps; // forward jumps present? 
@@ -1978,6 +2013,17 @@ class emitter return (instrDescCGCA*)emitAllocAnyInstr(sizeof(instrDescCGCA), attr); } +#if FEATURE_LOOP_ALIGN + instrDescAlign* emitAllocInstrAlign() + { +#if EMITTER_STATS + emitTotalDescAlignCnt++; +#endif // EMITTER_STATS + return (instrDescAlign*)emitAllocAnyInstr(sizeof(instrDescAlign), EA_1BYTE); + } + instrDescAlign* emitNewInstrAlign(); +#endif + instrDesc* emitNewInstrSmall(emitAttr attr); instrDesc* emitNewInstr(emitAttr attr = EA_4BYTE); instrDesc* emitNewInstrSC(emitAttr attr, cnsval_ssize_t cns); @@ -2299,6 +2345,7 @@ class emitter #define SMALL_CNS_TSZ 256 static unsigned emitSmallCns[SMALL_CNS_TSZ]; static unsigned emitLargeCnsCnt; + static unsigned emitTotalDescAlignCnt; static unsigned emitIFcounts[IF_COUNT]; @@ -2501,6 +2548,15 @@ inline emitter::instrDescJmp* emitter::emitNewInstrJmp() return emitAllocInstrJmp(); } +#if FEATURE_LOOP_ALIGN +inline emitter::instrDescAlign* emitter::emitNewInstrAlign() +{ + instrDescAlign* newInstr = emitAllocInstrAlign(); + newInstr->idIns(INS_align); + return newInstr; +} +#endif + #if !defined(TARGET_ARM64) inline emitter::instrDescLbl* emitter::emitNewInstrLbl() { diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index e91f0cf6d55c95..b6ca4dd7030a3e 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -874,9 +874,16 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c // * W must be unset (0x00 validates bit 7) if ((vexPrefix & 0xFFFF7F80) == 0x00C46100) { - emitOutputByte(dst, 0xC5); - emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F)); - return 2; + // This encoding optimization is not accounted for while estimating the instruction + // size, and thus over-predicts the instruction size by 1 byte. + // If there are IGs that will be aligned, do not optimize the encoding, so that the + // estimated alignment sizes stay accurate. + if (emitCurIG->igNum > emitLastAlignedIgNum) + { + emitOutputByte(dst, 0xC5); + emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F)); + return 2; + } } emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF)); @@ -2651,22 +2658,62 @@ emitter::instrDesc* emitter::emitNewInstrAmdCns(emitAttr size, ssize_t dsp, int } } -/***************************************************************************** - * - * The next instruction will be a loop head entry point - * So insert a dummy instruction here to ensure that - * the x86 I-cache alignment rule is followed. - */ - -void emitter::emitLoopAlign() +//----------------------------------------------------------------------------- +// +// The next instruction will be a loop head entry point +// So insert an alignment instruction here to ensure that +// we can properly align the code. 
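+// The align pseudo-instruction only reserves the padding bytes; the actual NOP encodings are chosen later in +// emitOutputAlign, once final offsets are known.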
+// +void emitter::emitLoopAlign(unsigned short paddingBytes) { /* Insert a pseudo-instruction to ensure that we align the next instruction properly */ - instrDesc* id = emitNewInstrSmall(EA_1BYTE); - id->idIns(INS_align); - id->idCodeSize(15); // We may need to skip up to 15 bytes of code - emitCurIGsize += 15; + assert(paddingBytes <= MAX_ENCODED_SIZE); + paddingBytes = min(paddingBytes, MAX_ENCODED_SIZE); // We may need to skip up to 15 bytes of code + instrDescAlign* id = emitNewInstrAlign(); + id->idCodeSize(paddingBytes); + emitCurIGsize += paddingBytes; + + id->idaIG = emitCurIG; + + /* Append this instruction to this IG's alignment list */ + id->idaNext = emitCurIGAlignList; + emitCurIGAlignList = id; +} + +//----------------------------------------------------------------------------- +// +// The next instruction will be a loop head entry point +// So insert alignment instruction(s) here to ensure that +// we can properly align the code. +// +// This emits more than one `INS_align` instruction depending on the +// alignmentBoundary parameter. +// +void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) +{ + unsigned short nPaddingBytes = alignmentBoundary - 1; + unsigned short nAlignInstr = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; + unsigned short instrDescSize = nAlignInstr * sizeof(instrDescAlign); + unsigned short insAlignCount = nPaddingBytes / MAX_ENCODED_SIZE; + unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE; + + // Ensure that all align instructions fall in same IG. + if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) + { + emitForceNewIG = true; + } + + /* Insert a pseudo-instruction to ensure that we align + the next instruction properly */ + + while (insAlignCount) + { + emitLoopAlign(); + insAlignCount--; + } + emitLoopAlign(lastInsAlignSize); } /***************************************************************************** @@ -2676,7 +2723,7 @@ void emitter::emitLoopAlign() void emitter::emitIns_Nop(unsigned size) { - assert(size <= 15); + assert(size <= MAX_ENCODED_SIZE); instrDesc* id = emitNewInstr(); id->idIns(INS_nop); @@ -7341,6 +7388,12 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id) switch (idOp) { case ID_OP_NONE: +#if FEATURE_LOOP_ALIGN + if (id->idIns() == INS_align) + { + return sizeof(instrDescAlign); + } +#endif break; case ID_OP_LBL: @@ -9325,6 +9378,49 @@ static BYTE* emitOutputNOP(BYTE* dst, size_t nBytes) return dst; } +//-------------------------------------------------------------------- +// emitOutputAlign: Outputs NOPs to align the loop +// +// Arguments: +// ig - Current instruction group +// id - align instruction that holds amount of padding (NOPs) to add +// dst - Destination buffer +// +// Return Value: +// Updated 'dst' pointer, advanced past the emitted NOPs. +// +// Notes: +// Amount of padding needed to align the loop is already calculated. This +// method extracts that information and inserts suitable NOP instructions. +// +BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) +{ + // Candidate for loop alignment + assert(codeGen->ShouldAlignLoops()); + assert(ig->isLoopAlign()); + + unsigned paddingToAdd = id->idCodeSize(); + + // Either things are already aligned or align them here. 
+ assert((paddingToAdd == 0) || (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0)); + + // Padding amount should not exceed the alignment boundary + assert(paddingToAdd < emitComp->opts.compJitAlignLoopBoundary); + +#ifdef DEBUG + bool displayAlignmentDetails = (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose; + unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(ig, (size_t)dst, displayAlignmentDetails); + + // For non-adaptive, the padding is spread across multiple instructions, so don't bother checking + if (emitComp->opts.compJitAlignLoopAdaptive) + { + assert(paddingToAdd == paddingNeeded); + } +#endif + + return emitOutputNOP(dst, paddingToAdd); +} + /***************************************************************************** * * Output an instruction involving an address mode. @@ -12398,7 +12494,8 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) #ifdef DEBUG if (emitComp->verbose) { - printf("; NOTE: size of jump [%08X] mis-predicted\n", emitComp->dspPtr(id)); + printf("; NOTE: size of jump [%08X] mis-predicted by %d bytes\n", emitComp->dspPtr(id), + (id->idCodeSize() - JMP_SIZE_SMALL)); } #endif } @@ -12559,10 +12656,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { assert(emitIssuing); - BYTE* dst = *dp; - size_t sz = sizeof(instrDesc); - instruction ins = id->idIns(); - unsigned char callInstrSize = 0; + BYTE* dst = *dp; + size_t sz = sizeof(instrDesc); + instruction ins = id->idIns(); + unsigned char callInstrSize = 0; + int emitOffsAdjBefore = emitOffsAdj; #ifdef DEBUG bool dspOffs = emitComp->opts.dspGCtbls; @@ -12598,9 +12696,21 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // the loop alignment pseudo instruction if (ins == INS_align) { - sz = SMALL_IDSC_SIZE; - dst = emitOutputNOP(dst, (-(int)(size_t)dst) & 0x0f); - assert(((size_t)dst & 0x0f) == 0); + sz = sizeof(instrDescAlign); + // The IG can be marked as not needing alignment after the align instruction was emitted. + // In that case, skip outputting the alignment. + if (ig->isLoopAlign()) + { + dst = emitOutputAlign(ig, id, dst); + } +#ifdef DEBUG + else + { + // If the IG is not marked as needing alignment, then the code size + // should be zero, i.e. no padding is needed. + assert(id->idCodeSize() == 0); + } +#endif break; } @@ -13704,7 +13814,49 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { emitDispIns(id, false, dspOffs, true, emitCurCodeOffs(*dp), *dp, (dst - *dp)); } +#endif +#if FEATURE_LOOP_ALIGN + // Only compensate over-estimated instructions if emitCurIG is before + // the last IG that needs alignment. 
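+ // Past the last aligned IG the code is allowed to shrink, since no later loop head depends on these offsets.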
+ if (emitCurIG->igNum <= emitLastAlignedIgNum) + { + int diff = id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp)); + assert(diff >= 0); + if (diff != 0) + { + +#ifdef DEBUG + // should never over-estimate align instruction + assert(id->idIns() != INS_align); + JITDUMP("Added over-estimation compensation: %d\n", diff); + + if (emitComp->opts.disAsm) + { + emitDispInsAddr(dst); + printf("\t\t ;; NOP compensation instructions of %d bytes.\n", diff); + } +#endif + + dst = emitOutputNOP(dst, diff); + + // since we compensated the over-estimation, revert the offsAdj that + // might have happened in the jump + if (emitOffsAdjBefore != emitOffsAdj) + { +#ifdef DEBUG + insFormat format = id->idInsFmt(); + assert((format == IF_LABEL) || (format == IF_RWR_LABEL) || (format == IF_SWR_LABEL)); + assert(diff == (emitOffsAdj - emitOffsAdjBefore)); +#endif + emitOffsAdj -= diff; + } + } + assert((id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp))) == 0); + } +#endif + +#ifdef DEBUG if (emitComp->compDebugBreak) { // set JitEmitPrintRefRegs=1 will print out emitThisGCrefRegs and emitThisByrefRegs diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index fb2aac2d30f0d9..b0a8327acedb69 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -50,6 +50,7 @@ UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val); UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code); UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code, int val); +BYTE* emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst); BYTE* emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); BYTE* emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); BYTE* emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); @@ -287,7 +288,9 @@ inline emitAttr emitDecodeScale(unsigned ensz) /************************************************************************/ public: -void emitLoopAlign(); +void emitLoopAlign(unsigned short paddingBytes = 15); + +void emitLongLoopAlign(unsigned short alignmentBoundary); void emitIns(instruction ins); diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index e70f192fd9500a..b0a49030dacd4a 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -9642,9 +9642,9 @@ BasicBlock* Compiler::fgSplitBlockAtEnd(BasicBlock* curr) newBlock->bbFlags = curr->bbFlags; // Remove flags that the new block can't have. - newBlock->bbFlags &= - ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL | BBF_JMP_TARGET | - BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET); + newBlock->bbFlags &= ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL | + BBF_JMP_TARGET | BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | + BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET | BBF_LOOP_ALIGN); // Remove the GC safe bit on the new block. 
It seems clear that if we split 'curr' at the end, // such that all the code is left in 'curr', and 'newBlock' just gets the control flow, then @@ -10946,6 +10946,18 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) break; } + // Add the LOOP_ALIGN flag + if (bNext->isLoopAlign()) + { + // Only if the merged block is a jump target or has a label + if (((block->bbFlags & BBF_JMP_TARGET) != 0) || ((block->bbFlags & BBF_HAS_LABEL) != 0)) + { + block->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " during compacting.\n", bNext->bbNum, + block->bbNum); + } + } + // If we're collapsing a block created after the dominators are // computed, copy block number the block and reuse dominator // information from bNext to block. @@ -11536,6 +11548,14 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) if (block->isLoopHead() && (succBlock->bbNum <= block->bbNum)) { succBlock->bbFlags |= BBF_LOOP_HEAD; + + if (block->isLoopAlign()) + { + succBlock->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " for loop# %d.\n", block->bbNum, + succBlock->bbNum, block->bbNatLoopNum); + } + if (fgDomsComputed && fgReachable(succBlock, block)) { /* Mark all the reachable blocks between 'succBlock' and 'block', excluding 'block' */ diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index 9fb780dbd40c66..62e7ac8059b16d 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -747,6 +747,12 @@ class Histogram #define CLFLG_STRUCTPROMOTE 0x00000 #endif +#ifdef TARGET_XARCH +#define FEATURE_LOOP_ALIGN 1 +#else +#define FEATURE_LOOP_ALIGN 0 +#endif + #define CLFLG_MAXOPT \ (CLFLG_CSE | CLFLG_REGVAR | CLFLG_RNGCHKOPT | CLFLG_DEADASGN | CLFLG_CODEMOTION | CLFLG_QMARK | CLFLG_TREETRANS | \ CLFLG_INLINING | CLFLG_STRUCTPROMOTE | CLFLG_CONSTANTFOLD) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 865ae3033f09aa..5ffab7c0f29e96 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -41,6 +41,27 @@ CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In deb // optimizations are performed on the fast path. CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra // with this byte. +CONFIG_INTEGER(JitAlignLoopMinBlockWeight, + W("JitAlignLoopMinBlockWeight"), + DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT) // Minimum weight needed for the first block of a loop to make it a + // candidate for alignment. +CONFIG_INTEGER(JitAlignLoopMaxCodeSize, + W("JitAlignLoopMaxCodeSize"), + DEFAULT_MAX_LOOPSIZE_FOR_ALIGN) // For non-adaptive alignment, maximum loop size (in bytes) for which + // alignment will be done. + // Defaults to 3 blocks of 32-byte chunks = 96 bytes. +CONFIG_INTEGER(JitAlignLoopBoundary, + W("JitAlignLoopBoundary"), + DEFAULT_ALIGN_LOOP_BOUNDARY) // For non-adaptive alignment, address boundary (power of 2) at which loop + // alignment should be done. By default, 32B. +CONFIG_INTEGER(JitAlignLoopForJcc, + W("JitAlignLoopForJcc"), + 0) // If set, for non-adaptive alignment, ensure loop jmps do not sit on or cross the alignment boundary. + +CONFIG_INTEGER(JitAlignLoopAdaptive, + W("JitAlignLoopAdaptive"), + 1) // If set, perform adaptive loop alignment that limits the amount of padding based on loop size. 
+
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
 CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1)
 CONFIG_INTEGER(JitDumpASCII, W("JitDumpASCII"), 1) // Uses only ASCII characters in tree dumps
@@ -202,6 +223,12 @@ CONFIG_INTEGER(EnableIncompleteISAClass, W("EnableIncompleteISAClass"), 0) // En
                                                                            // intrinsic classes
 #endif // defined(DEBUG)

+#if FEATURE_LOOP_ALIGN
+CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 1) // If set, align inner loops
+#else
+CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 0)
+#endif
+
 ///
 /// JIT
 ///
diff --git a/src/coreclr/jit/jitee.h b/src/coreclr/jit/jitee.h
index 298536138b2e1a..6301166e489c0f 100644
--- a/src/coreclr/jit/jitee.h
+++ b/src/coreclr/jit/jitee.h
@@ -63,45 +63,45 @@ class JitFlags
        JIT_FLAG_BBINSTR = 29,              // Collect basic block profile information
        JIT_FLAG_BBOPT = 30,                // Optimize method based on profile information
        JIT_FLAG_FRAMED = 31,               // All methods have an EBP frame
-       JIT_FLAG_ALIGN_LOOPS = 32,          // add NOPs before loops to align them at 16 byte boundaries
+       JIT_FLAG_UNUSED12 = 32,
        JIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0. (used by IL stubs)
-       JIT_FLAG_UNUSED12 = 34,
+       JIT_FLAG_UNUSED13 = 34,
        JIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
        JIT_FLAG_USE_PINVOKE_HELPERS = 36,     // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions
        JIT_FLAG_REVERSE_PINVOKE = 37,         // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog
-       JIT_FLAG_UNUSED13 = 38,
+       JIT_FLAG_UNUSED14 = 38,
        JIT_FLAG_TIER0 = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible
        JIT_FLAG_TIER1 = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code

#if defined(TARGET_ARM)
        JIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records
#else // !defined(TARGET_ARM)
-       JIT_FLAG_UNUSED14 = 41,
+       JIT_FLAG_UNUSED15 = 41,
#endif // !defined(TARGET_ARM)

        JIT_FLAG_NO_INLINING = 42, // JIT should not inline any called method into this method
-       JIT_FLAG_UNUSED15 = 43,
-       JIT_FLAG_UNUSED16 = 44,
-       JIT_FLAG_UNUSED17 = 45,
-       JIT_FLAG_UNUSED18 = 46,
-       JIT_FLAG_UNUSED19 = 47,
-       JIT_FLAG_UNUSED20 = 48,
-       JIT_FLAG_UNUSED21 = 49,
-       JIT_FLAG_UNUSED22 = 50,
-       JIT_FLAG_UNUSED23 = 51,
-       JIT_FLAG_UNUSED24 = 52,
-       JIT_FLAG_UNUSED25 = 53,
-       JIT_FLAG_UNUSED26 = 54,
-       JIT_FLAG_UNUSED27 = 55,
-       JIT_FLAG_UNUSED28 = 56,
-       JIT_FLAG_UNUSED29 = 57,
-       JIT_FLAG_UNUSED30 = 58,
-       JIT_FLAG_UNUSED31 = 59,
-       JIT_FLAG_UNUSED32 = 60,
-       JIT_FLAG_UNUSED33 = 61,
-       JIT_FLAG_UNUSED34 = 62,
-       JIT_FLAG_UNUSED35 = 63
+       JIT_FLAG_UNUSED16 = 43,
+       JIT_FLAG_UNUSED17 = 44,
+       JIT_FLAG_UNUSED18 = 45,
+       JIT_FLAG_UNUSED19 = 46,
+       JIT_FLAG_UNUSED20 = 47,
+       JIT_FLAG_UNUSED21 = 48,
+       JIT_FLAG_UNUSED22 = 49,
+       JIT_FLAG_UNUSED23 = 50,
+       JIT_FLAG_UNUSED24 = 51,
+       JIT_FLAG_UNUSED25 = 52,
+       JIT_FLAG_UNUSED26 = 53,
+       JIT_FLAG_UNUSED27 = 54,
+       JIT_FLAG_UNUSED28 = 55,
+       JIT_FLAG_UNUSED29 = 56,
+       JIT_FLAG_UNUSED30 = 57,
+       JIT_FLAG_UNUSED31 = 58,
+       JIT_FLAG_UNUSED32 = 59,
+       JIT_FLAG_UNUSED33 = 60,
+       JIT_FLAG_UNUSED34 = 61,
+       JIT_FLAG_UNUSED35 = 62,
+       JIT_FLAG_UNUSED36 = 63
    };

    // clang-format on
@@ -201,7 +201,6 @@ class JitFlags
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR, JIT_FLAG_BBINSTR);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_BBOPT, JIT_FLAG_BBOPT);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_FRAMED, JIT_FLAG_FRAMED);
-       FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_ALIGN_LOOPS, JIT_FLAG_ALIGN_LOOPS);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_PUBLISH_SECRET_PARAM, JIT_FLAG_PUBLISH_SECRET_PARAM);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_SAMPLING_JIT_BACKGROUND, JIT_FLAG_SAMPLING_JIT_BACKGROUND);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_USE_PINVOKE_HELPERS, JIT_FLAG_USE_PINVOKE_HELPERS);
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index b72d32e5ce2eab..e286bc26d3fba7 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -16317,6 +16317,12 @@ bool Compiler::fgFoldConditional(BasicBlock* block)
                  * Remove the loop from the table */

                 optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED;
+
+#if FEATURE_LOOP_ALIGN
+                optLoopTable[loopNum].lpFirst->bbFlags &= ~BBF_LOOP_ALIGN;
+                JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n",
+                        optLoopTable[loopNum].lpFirst->bbNum);
+#endif
+
 #ifdef DEBUG
                 if (verbose)
                 {
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index e134915cfe9d39..ddadd938fcfc68 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2578,6 +2578,41 @@ void Compiler::optFindNaturalLoops()
 #endif // DEBUG
 }

+//-----------------------------------------------------------------------------
+//
+// Inner loops whose first block's weight meets a threshold are marked
+// as needing alignment.
+//
+
+void Compiler::optIdentifyLoopsForAlignment()
+{
+#if FEATURE_LOOP_ALIGN
+    if (codeGen->ShouldAlignLoops())
+    {
+        for (unsigned char loopInd = 0; loopInd < optLoopCount; loopInd++)
+        {
+            BasicBlock* first = optLoopTable[loopInd].lpFirst;
+
+            // An inner loop candidate that might need alignment
+            if (optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP)
+            {
+                if (first->getBBWeight(this) >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT))
+                {
+                    first->bbFlags |= BBF_LOOP_ALIGN;
+                    JITDUMP("L%02u that starts at " FMT_BB " needs alignment, weight=%f.\n", loopInd, first->bbNum,
+                            first->getBBWeight(this));
+                }
+                else
+                {
+                    JITDUMP("Skip alignment for L%02u that starts at " FMT_BB ", weight=%f.\n", loopInd, first->bbNum,
+                            first->getBBWeight(this));
+                }
+            }
+        }
+    }
+#endif
+}
+
 void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap)
 {
     BasicBlock* newJumpDest = nullptr;
@@ -3757,6 +3792,22 @@ void Compiler::optUnrollLoops()
 #endif
         }

+#if FEATURE_LOOP_ALIGN
+        for (block = head->bbNext;; block = block->bbNext)
+        {
+            if (block->isLoopAlign())
+            {
+                block->bbFlags &= ~BBF_LOOP_ALIGN;
+                JITDUMP("Removing LOOP_ALIGN flag from unrolled loop in " FMT_BB "\n", block->bbNum);
+            }
+
+            if (block == bottom)
+            {
+                break;
+            }
+        }
+#endif
+
         /* Create the unrolled loop statement list */
         {
             BlockToBlockMap blockMap(getAllocator());
@@ -4506,6 +4557,10 @@ void Compiler::optOptimizeLoops()
         }
     }

+    // Check if any of the loops need alignment
+
+    optIdentifyLoopsForAlignment();
+
 #if COUNT_LOOPS
     totalUnnatLoopCount += loopNum;
 #endif
@@ -5146,9 +5201,10 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
 {
     assert(loopInd < optLoopCount);

-    JITDUMP("\nCloning loop %d: [h: %d, f: %d, t: %d, e: %d, b: %d].\n", loopInd, optLoopTable[loopInd].lpHead->bbNum,
-            optLoopTable[loopInd].lpFirst->bbNum, optLoopTable[loopInd].lpTop->bbNum,
-            optLoopTable[loopInd].lpEntry->bbNum, optLoopTable[loopInd].lpBottom->bbNum);
+    JITDUMP("\nCloning loop %d: [h: %d, f: %d, t: %d, e: %d, b: %d, c: %d].\n", loopInd,
+            optLoopTable[loopInd].lpHead->bbNum, optLoopTable[loopInd].lpFirst->bbNum,
+            optLoopTable[loopInd].lpTop->bbNum, optLoopTable[loopInd].lpEntry->bbNum,
+            optLoopTable[loopInd].lpBottom->bbNum, optLoopTable[loopInd].lpChild);
     // Determine the depth of the loop, so we can properly weight blocks added (outside the cloned loop blocks).
     unsigned depth = optLoopDepth(loopInd);
@@ -7975,6 +8031,20 @@ bool Compiler::optComputeLoopSideEffectsOfBlock(BasicBlock* blk)
 // Marks the containsCall information to "lnum" and any parent loops.
 void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
 {
+
+#if FEATURE_LOOP_ALIGN
+    // If this is the innermost loop, clear the BBF_LOOP_ALIGN flag,
+    // because a loop that contains a call is unlikely to benefit from
+    // alignment.
+    if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP)
+    {
+        BasicBlock* first = optLoopTable[lnum].lpFirst;
+        first->bbFlags &= ~BBF_LOOP_ALIGN;
+        JITDUMP("Removing LOOP_ALIGN flag for L%02u that starts at " FMT_BB " because loop has a call.\n", lnum,
+                first->bbNum);
+    }
+#endif
+
     assert(0 <= lnum && lnum < optLoopCount);
     while (lnum != BasicBlock::NOT_IN_LOOP)
     {
diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
index 79768f5fbdb9eb..1aadd4e2664542 100644
--- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
+++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
@@ -1307,13 +1307,13 @@ public enum CorJitFlag : uint
        CORJIT_FLAG_BBINSTR = 29,              // Collect basic block profile information
        CORJIT_FLAG_BBOPT = 30,                // Optimize method based on profile information
        CORJIT_FLAG_FRAMED = 31,               // All methods have an EBP frame
-       CORJIT_FLAG_ALIGN_LOOPS = 32,          // add NOPs before loops to align them at 16 byte boundaries
+       CORJIT_FLAG_UNUSED8 = 32,
        CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0. (used by IL stubs)
-       CORJIT_FLAG_UNUSED8 = 34,
+       CORJIT_FLAG_UNUSED9 = 34,
        CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
        CORJIT_FLAG_USE_PINVOKE_HELPERS = 36,     // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions
        CORJIT_FLAG_REVERSE_PINVOKE = 37,         // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog
-       CORJIT_FLAG_UNUSED9 = 38,
+       CORJIT_FLAG_UNUSED10 = 38,
        CORJIT_FLAG_TIER0 = 39,                // This is the initial tier for tiered compilation which should generate code as quickly as possible
        CORJIT_FLAG_TIER1 = 40,                // This is the final tier (for now) for tiered compilation which should generate high quality code
        CORJIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records
diff --git a/src/coreclr/vm/eeconfig.cpp b/src/coreclr/vm/eeconfig.cpp
index 389e4024e8c3bf..c1336060d21b71 100644
--- a/src/coreclr/vm/eeconfig.cpp
+++ b/src/coreclr/vm/eeconfig.cpp
@@ -118,7 +118,6 @@ HRESULT EEConfig::Init()
     iJitOptimizeType = OPT_DEFAULT;
     fJitFramed = false;
-    fJitAlignLoops = false;
     fJitMinOpts = false;

     fPInvokeRestoreEsp = (DWORD)-1;
@@ -689,7 +688,6 @@ fTrackDynamicMethodDebugInfo = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_
     dwJitHostMaxSlabCache = CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_JitHostMaxSlabCache);

     fJitFramed = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JitFramed, fJitFramed) != 0);
-    fJitAlignLoops = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JitAlignLoops, fJitAlignLoops) != 0);
     fJitMinOpts = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JITMinOpts, fJitMinOpts) == 1);
     iJitOptimizeType = GetConfigDWORD_DontUse_(CLRConfig::EXTERNAL_JitOptimizeType, iJitOptimizeType);
     if (iJitOptimizeType > OPT_RANDOM) iJitOptimizeType = OPT_DEFAULT;
diff --git a/src/coreclr/vm/eeconfig.h b/src/coreclr/vm/eeconfig.h
index 46616fa1f5d002..a068e447117e18 100644
--- a/src/coreclr/vm/eeconfig.h
+++ b/src/coreclr/vm/eeconfig.h
@@ -75,7 +75,6 @@ class EEConfig
     bool GetTrackDynamicMethodDebugInfo(void) const {LIMITED_METHOD_CONTRACT; return fTrackDynamicMethodDebugInfo; }
     unsigned int GenOptimizeType(void) const {LIMITED_METHOD_CONTRACT; return iJitOptimizeType; }
     bool JitFramed(void) const {LIMITED_METHOD_CONTRACT; return fJitFramed; }
-    bool JitAlignLoops(void) const {LIMITED_METHOD_CONTRACT; return fJitAlignLoops; }
     bool JitMinOpts(void) const {LIMITED_METHOD_CONTRACT; return fJitMinOpts; }

     // Tiered Compilation config
@@ -537,7 +536,6 @@ class EEConfig
     DWORD dwJitHostMaxSlabCache;       // max size for jit host slab cache
     bool fTrackDynamicMethodDebugInfo; // Enable/Disable tracking dynamic method debug info
     bool fJitFramed;                   // Enable/Disable EBP based frames
-    bool fJitAlignLoops;               // Enable/Disable loop alignment
     bool fJitMinOpts;                  // Enable MinOpts for all jitted methods

     unsigned iJitOptimizeType; // 0=Blended,1=SmallCode,2=FastCode, default is 0=Blended
diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp
index 0d60059283a3d0..aa60a55ceb3e27 100644
--- a/src/coreclr/vm/jitinterface.cpp
+++ b/src/coreclr/vm/jitinterface.cpp
@@ -12676,8 +12676,6 @@ CorJitResult CallCompileMethodWithSEHWrapper(EEJitManager *jitMgr,
     CORJIT_FLAGS flags;
     if (g_pConfig->JitFramed())
         flags.Set(CORJIT_FLAGS::CORJIT_FLAG_FRAMED);
-    if (g_pConfig->JitAlignLoops())
-        flags.Set(CORJIT_FLAGS::CORJIT_FLAG_ALIGN_LOOPS);
 #ifdef TARGET_X86
     if (g_pConfig->PInvokeRestoreEsp(ftn->GetModule()->IsPreV4Assembly()))
         flags.Set(CORJIT_FLAGS::CORJIT_FLAG_PINVOKE_RESTORE_ESP);
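With the EE-side JitAlignLoops plumbing and CORJIT_FLAG_ALIGN_LOOPS removed above, the alignment decision lives entirely in the JIT, which can also ask whether padding actually buys anything before emitting it. Below is a minimal sketch of that kind of final check, assuming a 32-byte fetch window; BoundariesCrossed and PaddingHelps are invented names for illustration, not the emitter's API.

#include <cstdio>

// Number of 'boundary'-byte lines the byte range [start, start + size) crosses.
// Assumes size >= 1 (a loop body is never empty).
static unsigned BoundariesCrossed(unsigned start, unsigned size, unsigned boundary)
{
    unsigned firstLine = start / boundary;
    unsigned lastLine  = (start + size - 1) / boundary;
    return lastLine - firstLine;
}

// Pad only when doing so removes at least one boundary crossing; otherwise
// the NOPs cost code size and decode bandwidth for nothing.
static bool PaddingHelps(unsigned offset, unsigned loopSize, unsigned padding, unsigned boundary)
{
    unsigned before = BoundariesCrossed(offset, loopSize, boundary);
    unsigned after  = BoundariesCrossed(offset + padding, loopSize, boundary);
    return after < before;
}

int main()
{
    // A 30-byte loop at offset 0x1C straddles two 32-byte windows; 4 bytes of
    // padding moves it into a single window, so this prints "pad".
    printf("%s\n", PaddingHelps(0x1C, 30, 4, 32) ? "pad" : "skip");
    return 0;
}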